sys/i386/i386/pmap.c
1 /*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2005 Alan L. Cox <alc@cs.rice.edu>
9 * All rights reserved.
10 *
11 * This code is derived from software contributed to Berkeley by
12 * the Systems Programming Group of the University of Utah Computer
13 * Science Department and William Jolitz of UUNET Technologies Inc.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. All advertising materials mentioning features or use of this software
24 * must display the following acknowledgement:
25 * This product includes software developed by the University of
26 * California, Berkeley and its contributors.
27 * 4. Neither the name of the University nor the names of its contributors
28 * may be used to endorse or promote products derived from this software
29 * without specific prior written permission.
30 *
31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
41 * SUCH DAMAGE.
42 *
43 * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
44 */
45 /*-
46 * Copyright (c) 2003 Networks Associates Technology, Inc.
47 * All rights reserved.
48 *
49 * This software was developed for the FreeBSD Project by Jake Burkholder,
50 * Safeport Network Services, and Network Associates Laboratories, the
51 * Security Research Division of Network Associates, Inc. under
52 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
53 * CHATS research program.
54 *
55 * Redistribution and use in source and binary forms, with or without
56 * modification, are permitted provided that the following conditions
57 * are met:
58 * 1. Redistributions of source code must retain the above copyright
59 * notice, this list of conditions and the following disclaimer.
60 * 2. Redistributions in binary form must reproduce the above copyright
61 * notice, this list of conditions and the following disclaimer in the
62 * documentation and/or other materials provided with the distribution.
63 *
64 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
65 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
66 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
67 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
68 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
69 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
70 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
71 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
72 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
73 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
74 * SUCH DAMAGE.
75 */
76
77 #include <sys/cdefs.h>
78 __FBSDID("$FreeBSD$");
79
80 /*
81 * Manages physical address maps.
82 *
83 * In addition to hardware address maps, this
84 * module is called upon to provide software-use-only
85 * maps which may or may not be stored in the same
86 * form as hardware maps. These pseudo-maps are
87 * used to store intermediate results from copy
88 * operations to and from address spaces.
89 *
90 * Since the information managed by this module is
91 * also stored by the logical address mapping module,
92 * this module may throw away valid virtual-to-physical
93 * mappings at almost any time. However, invalidations
94 * of virtual-to-physical mappings must be done as
95 * requested.
96 *
97 * In order to cope with hardware architectures which
98 * make virtual-to-physical map invalidates expensive,
99  *	this module may delay invalidation or protection-reduction
100 * operations until such time as they are actually
101 * necessary. This module is given full information as
102 * to which processors are currently using which maps,
103 * and to when physical maps must be made correct.
104 */
105
106 #include "opt_cpu.h"
107 #include "opt_pmap.h"
108 #include "opt_msgbuf.h"
109 #include "opt_smp.h"
110 #include "opt_xbox.h"
111
112 #include <sys/param.h>
113 #include <sys/systm.h>
114 #include <sys/kernel.h>
115 #include <sys/lock.h>
116 #include <sys/malloc.h>
117 #include <sys/mman.h>
118 #include <sys/msgbuf.h>
119 #include <sys/mutex.h>
120 #include <sys/proc.h>
121 #include <sys/sx.h>
122 #include <sys/vmmeter.h>
123 #include <sys/sched.h>
124 #include <sys/sysctl.h>
125 #ifdef SMP
126 #include <sys/smp.h>
127 #endif
128
129 #include <vm/vm.h>
130 #include <vm/vm_param.h>
131 #include <vm/vm_kern.h>
132 #include <vm/vm_page.h>
133 #include <vm/vm_map.h>
134 #include <vm/vm_object.h>
135 #include <vm/vm_extern.h>
136 #include <vm/vm_pageout.h>
137 #include <vm/vm_pager.h>
138 #include <vm/uma.h>
139
140 #include <machine/cpu.h>
141 #include <machine/cputypes.h>
142 #include <machine/md_var.h>
143 #include <machine/pcb.h>
144 #include <machine/specialreg.h>
145 #ifdef SMP
146 #include <machine/smp.h>
147 #endif
148
149 #ifdef XBOX
150 #include <machine/xbox.h>
151 #endif
152
153 #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
154 #define CPU_ENABLE_SSE
155 #endif
156
157 #ifndef PMAP_SHPGPERPROC
158 #define PMAP_SHPGPERPROC 200
159 #endif
160
161 #if defined(DIAGNOSTIC)
162 #define PMAP_DIAGNOSTIC
163 #endif
164
165 #if !defined(PMAP_DIAGNOSTIC)
166 #define PMAP_INLINE __gnu89_inline
167 #else
168 #define PMAP_INLINE
169 #endif
170
171 #define PV_STATS
172 #ifdef PV_STATS
173 #define PV_STAT(x) do { x ; } while (0)
174 #else
175 #define PV_STAT(x) do { } while (0)
176 #endif
177
178 /*
179 * Get PDEs and PTEs for user/kernel address space
180 */
181 #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
182 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
183
184 #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0)
185 #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0)
186 #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0)
187 #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0)
188 #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0)
189
190 #define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
191 atomic_clear_int((u_int *)(pte), PG_W))
192 #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
193
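/*
 * Example: on non-PAE i386 (PDRSHIFT == 22, NPTEPG == 1024), pmap_pde()
 * splits a 32-bit VA into a 10-bit directory index, a 10-bit table
 * index, and a 12-bit offset, so va 0xbfc01234 selects pm_pdir[0x2ff].
 * Under PAE, PDRSHIFT is 21 and NPTEPG is 512, but the macros are
 * unchanged because the constants change with the kernel configuration.
 */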
194 struct pmap kernel_pmap_store;
195 LIST_HEAD(pmaplist, pmap);
196 static struct pmaplist allpmaps;
197 static struct mtx allpmaps_lock;
198
199 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
200 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
201 int pgeflag = 0; /* PG_G or-in */
202 int pseflag = 0; /* PG_PS or-in */
203
204 static int nkpt;
205 vm_offset_t kernel_vm_end;
206 extern u_int32_t KERNend;
207
208 #ifdef PAE
209 pt_entry_t pg_nx;
210 static uma_zone_t pdptzone;
211 #endif
212
213 /*
214 * Data for the pv entry allocation mechanism
215 */
216 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
217 static int shpgperproc = PMAP_SHPGPERPROC;
218
219 struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */
220 int pv_maxchunks; /* How many chunks we have KVA for */
221 vm_offset_t pv_vafree; /* freelist stored in the PTE */
222
223 /*
224 * All those kernel PT submaps that BSD is so fond of
225 */
226 struct sysmaps {
227 struct mtx lock;
228 pt_entry_t *CMAP1;
229 pt_entry_t *CMAP2;
230 caddr_t CADDR1;
231 caddr_t CADDR2;
232 };
233 static struct sysmaps sysmaps_pcpu[MAXCPU];
234 pt_entry_t *CMAP1 = 0;
235 static pt_entry_t *CMAP3;
236 caddr_t CADDR1 = 0, ptvmmap = 0;
237 static caddr_t CADDR3;
238 struct msgbuf *msgbufp = 0;
239
240 /*
241 * Crashdump maps.
242 */
243 static caddr_t crashdumpmap;
244
245 #ifdef SMP
246 extern pt_entry_t *SMPpt;
247 #endif
248 static pt_entry_t *PMAP1 = 0, *PMAP2;
249 static pt_entry_t *PADDR1 = 0, *PADDR2;
250 #ifdef SMP
251 static int PMAP1cpu;
252 static int PMAP1changedcpu;
253 SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD,
254 &PMAP1changedcpu, 0,
255 "Number of times pmap_pte_quick changed CPU with same PMAP1");
256 #endif
257 static int PMAP1changed;
258 SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD,
259 &PMAP1changed, 0,
260 "Number of times pmap_pte_quick changed PMAP1");
261 static int PMAP1unchanged;
262 SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD,
263 &PMAP1unchanged, 0,
264 "Number of times pmap_pte_quick didn't change PMAP1");
265 static struct mtx PMAP2mutex;
266
267 static void free_pv_entry(pmap_t pmap, pv_entry_t pv);
268 static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
269
270 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
271 vm_page_t m, vm_prot_t prot, vm_page_t mpte);
272 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
273 vm_page_t *free);
274 static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
275 vm_page_t *free);
276 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
277 vm_offset_t va);
278 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
279 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
280 vm_page_t m);
281
282 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
283
284 static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags);
285 static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free);
286 static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
287 static void pmap_pte_release(pt_entry_t *pte);
288 static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *);
289 static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
290 #ifdef PAE
291 static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
292 #endif
293
294 CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
295 CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
296
297 /*
298 * Move the kernel virtual free pointer to the next
299 * 4MB. This is used to help improve performance
300 * by using a large (4MB) page for much of the kernel
301 * (.text, .data, .bss)
302 */
303 static vm_offset_t
304 pmap_kmem_choose(vm_offset_t addr)
305 {
306 vm_offset_t newaddr = addr;
307
308 #ifndef DISABLE_PSE
309 if (cpu_feature & CPUID_PSE)
310 newaddr = (addr + PDRMASK) & ~PDRMASK;
311 #endif
312 return newaddr;
313 }
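/*
 * Example: with PSE available, an addr of 0x00401000 is rounded up to
 * 0x00800000, the next superpage boundary (PDRMASK is 4MB - 1 on
 * non-PAE kernels, 2MB - 1 under PAE).
 */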
314
315 /*
316 * Bootstrap the system enough to run with virtual memory.
317 *
318 * On the i386 this is called after mapping has already been enabled
319 * and just syncs the pmap module with what has already been done.
320 * [We can't call it easily with mapping off since the kernel is not
321 * mapped with PA == VA, hence we would have to relocate every address
322 * from the linked base (virtual) address "KERNBASE" to the actual
323 * (physical) address starting relative to 0]
324 */
325 void
326 pmap_bootstrap(vm_paddr_t firstaddr)
327 {
328 vm_offset_t va;
329 pt_entry_t *pte, *unused;
330 struct sysmaps *sysmaps;
331 int i;
332
333 /*
334 * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too
335 * large. It should instead be correctly calculated in locore.s and
336 * not based on 'first' (which is a physical address, not a virtual
337 * address, for the start of unused physical memory). The kernel
338 * page tables are NOT double mapped and thus should not be included
339 * in this calculation.
340 */
341 virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
342 virtual_avail = pmap_kmem_choose(virtual_avail);
343
344 virtual_end = VM_MAX_KERNEL_ADDRESS;
345
346 /*
347 * Initialize the kernel pmap (which is statically allocated).
348 */
349 PMAP_LOCK_INIT(kernel_pmap);
350 kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
351 #ifdef PAE
352 kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
353 #endif
354 kernel_pmap->pm_active = -1; /* don't allow deactivation */
355 TAILQ_INIT(&kernel_pmap->pm_pvchunk);
356 LIST_INIT(&allpmaps);
357 mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
358 mtx_lock_spin(&allpmaps_lock);
359 LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
360 mtx_unlock_spin(&allpmaps_lock);
361 nkpt = NKPT;
362
363 /*
364 * Reserve some special page table entries/VA space for temporary
365 * mapping of pages.
366 */
367 #define SYSMAP(c, p, v, n) \
368 v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
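/*
 * For example, SYSMAP(caddr_t, CMAP1, CADDR1, 1) expands to
 *	CADDR1 = (caddr_t)va; va += ((1)*PAGE_SIZE); CMAP1 = pte; pte += (1);
 * reserving one page of KVA at CADDR1 and remembering its pte slot in
 * CMAP1 so the page can later be remapped with a single pte store.
 */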
369
370 va = virtual_avail;
371 pte = vtopte(va);
372
373 /*
374 * CMAP1/CMAP2 are used for zeroing and copying pages.
375 * CMAP3 is used for the idle process page zeroing.
376 */
377 for (i = 0; i < MAXCPU; i++) {
378 sysmaps = &sysmaps_pcpu[i];
379 mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
380 SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
381 SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
382 }
383 SYSMAP(caddr_t, CMAP1, CADDR1, 1)
384 SYSMAP(caddr_t, CMAP3, CADDR3, 1)
385 *CMAP3 = 0;
386
387 /*
388 * Crashdump maps.
389 */
390 SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
391
392 /*
393 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
394 */
395 SYSMAP(caddr_t, unused, ptvmmap, 1)
396
397 /*
398 * msgbufp is used to map the system message buffer.
399 */
400 SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE)))
401
402 /*
403 * ptemap is used for pmap_pte_quick
404 */
405 SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1);
406 SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1);
407
408 mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
409
410 virtual_avail = va;
411
412 *CMAP1 = 0;
413
414 /*
415 * Leave in place an identity mapping (virt == phys) for the low 1 MB
416 * physical memory region that is used by the ACPI wakeup code. This
417 * mapping must not have PG_G set.
418 */
419 #ifdef XBOX
420 /* FIXME: This is gross, but needed for the XBOX. Since we are in such
421  * an early stage, we cannot yet neatly map video memory ... :-(
422 * Better fixes are very welcome! */
423 if (!arch_i386_is_xbox)
424 #endif
425 for (i = 1; i < NKPT; i++)
426 PTD[i] = 0;
427
428 /* Initialize the PAT MSR if present. */
429 pmap_init_pat();
430
431 /* Turn on PG_G on kernel page(s) */
432 pmap_set_pg();
433 }
434
435 /*
436 * Setup the PAT MSR.
437 */
438 void
439 pmap_init_pat(void)
440 {
441 uint64_t pat_msr;
442
443 /* Bail if this CPU doesn't implement PAT. */
444 if (!(cpu_feature & CPUID_PAT))
445 return;
446
447 #ifdef PAT_WORKS
448 /*
449 * Leave the indices 0-3 at the default of WB, WT, UC, and UC-.
450 * Program 4 and 5 as WP and WC.
451 * Leave 6 and 7 as UC and UC-.
452 */
453 pat_msr = rdmsr(MSR_PAT);
454 pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5));
455 pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) |
456 PAT_VALUE(5, PAT_WRITE_COMBINING);
457 #else
458 /*
459 * Due to some Intel errata, we can only safely use the lower 4
460 * PAT entries. Thus, just replace PAT Index 2 with WC instead
461 * of UC-.
462 *
463 * Intel Pentium III Processor Specification Update
464 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
465 * or Mode C Paging)
466 *
467 * Intel Pentium IV Processor Specification Update
468 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
469 */
470 pat_msr = rdmsr(MSR_PAT);
471 pat_msr &= ~PAT_MASK(2);
472 pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
473 #endif
474 wrmsr(MSR_PAT, pat_msr);
475 }
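/*
 * With the errata workaround above, the effective PAT layout is
 *	index 0 = WB, 1 = WT, 2 = WC (instead of the power-on UC-), 3 = UC,
 * which is why pmap_cache_bits() below maps PAT_WRITE_COMBINING to
 * index 2 when PAT_WORKS is not defined.
 */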
476
477 /*
478 * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on.
479 */
480 void
481 pmap_set_pg(void)
482 {
483 pd_entry_t pdir;
484 pt_entry_t *pte;
485 vm_offset_t va, endva;
486 int i;
487
488 if (pgeflag == 0)
489 return;
490
491 i = KERNLOAD/NBPDR;
492 endva = KERNBASE + KERNend;
493
494 if (pseflag) {
495 va = KERNBASE + KERNLOAD;
496 while (va < endva) {
497 pdir = kernel_pmap->pm_pdir[KPTDI+i];
498 pdir |= pgeflag;
499 kernel_pmap->pm_pdir[KPTDI+i] = PTD[KPTDI+i] = pdir;
500 invltlb(); /* Play it safe, invltlb() every time */
501 i++;
502 va += NBPDR;
503 }
504 } else {
505 va = (vm_offset_t)btext;
506 while (va < endva) {
507 pte = vtopte(va);
508 if (*pte)
509 *pte |= pgeflag;
510 invltlb(); /* Play it safe, invltlb() every time */
511 va += PAGE_SIZE;
512 }
513 }
514 }
515
516 /*
517 * Initialize a vm_page's machine-dependent fields.
518 */
519 void
520 pmap_page_init(vm_page_t m)
521 {
522
523 TAILQ_INIT(&m->md.pv_list);
524 m->md.pv_list_count = 0;
525 }
526
527 #ifdef PAE
528
529 static MALLOC_DEFINE(M_PMAPPDPT, "pmap", "pmap pdpt");
530
531 static void *
532 pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
533 {
534 *flags = UMA_SLAB_PRIV;
535 return (contigmalloc(PAGE_SIZE, M_PMAPPDPT, 0, 0x0ULL, 0xffffffffULL,
536 1, 0));
537 }
538 #endif
539
540 /*
541  * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
542 * Requirements:
543 * - Must deal with pages in order to ensure that none of the PG_* bits
544 * are ever set, PG_V in particular.
545 * - Assumes we can write to ptes without pte_store() atomic ops, even
546 * on PAE systems. This should be ok.
547 * - Assumes nothing will ever test these addresses for 0 to indicate
548 * no mapping instead of correctly checking PG_V.
549 * - Assumes a vm_offset_t will fit in a pte (true for i386).
550 * Because PG_V is never set, there can be no mappings to invalidate.
551 */
552 static vm_offset_t
553 pmap_ptelist_alloc(vm_offset_t *head)
554 {
555 pt_entry_t *pte;
556 vm_offset_t va;
557
558 va = *head;
559 if (va == 0)
560 return (va); /* Out of memory */
561 pte = vtopte(va);
562 *head = *pte;
563 if (*head & PG_V)
564 panic("pmap_ptelist_alloc: va with PG_V set!");
565 *pte = 0;
566 return (va);
567 }
568
569 static void
570 pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
571 {
572 pt_entry_t *pte;
573
574 if (va & PG_V)
575 panic("pmap_ptelist_free: freeing va with PG_V set!");
576 pte = vtopte(va);
577 *pte = *head; /* virtual! PG_V is 0 though */
578 *head = va;
579 }
580
581 static void
582 pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
583 {
584 int i;
585 vm_offset_t va;
586
587 *head = 0;
588 for (i = npages - 1; i >= 0; i--) {
589 va = (vm_offset_t)base + i * PAGE_SIZE;
590 pmap_ptelist_free(head, va);
591 }
592 }
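/*
 * Example: pmap_ptelist_init(&head, base, 3) frees pages in the order
 * base+2*PAGE_SIZE, base+PAGE_SIZE, base, leaving head == base with
 * each page's pte holding the VA of the next free page and the chain
 * ending in 0.  pmap_ptelist_alloc() then pops from the front.
 */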
593
594
595 /*
596 * Initialize the pmap module.
597 * Called by vm_init, to initialize any structures that the pmap
598 * system needs to map virtual memory.
599 */
600 void
601 pmap_init(void)
602 {
603
604 /*
605 * Initialize the address space (zone) for the pv entries. Set a
606 * high water mark so that the system can recover from excessive
607 * numbers of pv entries.
608 */
609 TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
610 pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
611 TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
612 pv_entry_max = roundup(pv_entry_max, _NPCPV);
613 pv_entry_high_water = 9 * (pv_entry_max / 10);
614
615 pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
616 pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map,
617 PAGE_SIZE * pv_maxchunks);
618 if (pv_chunkbase == NULL)
619 panic("pmap_init: not enough kvm for pv chunks");
620 pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
621 #ifdef PAE
622 pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
623 NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
624 UMA_ZONE_VM | UMA_ZONE_NOFREE);
625 uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
626 #endif
627 }
628
629
630 SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
631 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
632 "Max number of PV entries");
633 SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
634 "Page share factor per proc");
635
636 /***************************************************
637 * Low level helper routines.....
638 ***************************************************/
639
640 /*
641 * Determine the appropriate bits to set in a PTE or PDE for a specified
642 * caching mode.
643 */
644 static int
645 pmap_cache_bits(int mode, boolean_t is_pde)
646 {
647 int pat_flag, pat_index, cache_bits;
648
649 /* The PAT bit is different for PTE's and PDE's. */
650 pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
651
652 /* If we don't support PAT, map extended modes to older ones. */
653 if (!(cpu_feature & CPUID_PAT)) {
654 switch (mode) {
655 case PAT_UNCACHEABLE:
656 case PAT_WRITE_THROUGH:
657 case PAT_WRITE_BACK:
658 break;
659 case PAT_UNCACHED:
660 case PAT_WRITE_COMBINING:
661 case PAT_WRITE_PROTECTED:
662 mode = PAT_UNCACHEABLE;
663 break;
664 }
665 }
666
667 /* Map the caching mode to a PAT index. */
668 switch (mode) {
669 #ifdef PAT_WORKS
670 case PAT_UNCACHEABLE:
671 pat_index = 3;
672 break;
673 case PAT_WRITE_THROUGH:
674 pat_index = 1;
675 break;
676 case PAT_WRITE_BACK:
677 pat_index = 0;
678 break;
679 case PAT_UNCACHED:
680 pat_index = 2;
681 break;
682 case PAT_WRITE_COMBINING:
683 pat_index = 5;
684 break;
685 case PAT_WRITE_PROTECTED:
686 pat_index = 4;
687 break;
688 #else
689 case PAT_UNCACHED:
690 case PAT_UNCACHEABLE:
691 case PAT_WRITE_PROTECTED:
692 pat_index = 3;
693 break;
694 case PAT_WRITE_THROUGH:
695 pat_index = 1;
696 break;
697 case PAT_WRITE_BACK:
698 pat_index = 0;
699 break;
700 case PAT_WRITE_COMBINING:
701 pat_index = 2;
702 break;
703 #endif
704 default:
705 panic("Unknown caching mode %d\n", mode);
706 }
707
708 /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */
709 cache_bits = 0;
710 if (pat_index & 0x4)
711 cache_bits |= pat_flag;
712 if (pat_index & 0x2)
713 cache_bits |= PG_NC_PCD;
714 if (pat_index & 0x1)
715 cache_bits |= PG_NC_PWT;
716 return (cache_bits);
717 }
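/*
 * Example: pat_index 2 (WC without PAT_WORKS) yields PG_NC_PCD, and
 * pat_index 3 (UC) yields PG_NC_PCD | PG_NC_PWT; only indices 4-7
 * ever set the PAT bit itself.
 */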
718 #ifdef SMP
719 /*
720 * For SMP, these functions have to use the IPI mechanism for coherence.
721 *
722 * N.B.: Before calling any of the following TLB invalidation functions,
723 * the calling processor must ensure that all stores updating a non-
724 * kernel page table are globally performed. Otherwise, another
725 * processor could cache an old, pre-update entry without being
726 * invalidated. This can happen one of two ways: (1) The pmap becomes
727 * active on another processor after its pm_active field is checked by
728 * one of the following functions but before a store updating the page
729 * table is globally performed. (2) The pmap becomes active on another
730 * processor before its pm_active field is checked but due to
731  * speculative loads one of the following functions still reads the
732 * pmap as inactive on the other processor.
733 *
734 * The kernel page table is exempt because its pm_active field is
735 * immutable. The kernel page table is always active on every
736 * processor.
737 */
738 void
739 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
740 {
741 u_int cpumask;
742 u_int other_cpus;
743
744 sched_pin();
745 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
746 invlpg(va);
747 smp_invlpg(va);
748 } else {
749 cpumask = PCPU_GET(cpumask);
750 other_cpus = PCPU_GET(other_cpus);
751 if (pmap->pm_active & cpumask)
752 invlpg(va);
753 if (pmap->pm_active & other_cpus)
754 smp_masked_invlpg(pmap->pm_active & other_cpus, va);
755 }
756 sched_unpin();
757 }
758
759 void
760 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
761 {
762 u_int cpumask;
763 u_int other_cpus;
764 vm_offset_t addr;
765
766 sched_pin();
767 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
768 for (addr = sva; addr < eva; addr += PAGE_SIZE)
769 invlpg(addr);
770 smp_invlpg_range(sva, eva);
771 } else {
772 cpumask = PCPU_GET(cpumask);
773 other_cpus = PCPU_GET(other_cpus);
774 if (pmap->pm_active & cpumask)
775 for (addr = sva; addr < eva; addr += PAGE_SIZE)
776 invlpg(addr);
777 if (pmap->pm_active & other_cpus)
778 smp_masked_invlpg_range(pmap->pm_active & other_cpus,
779 sva, eva);
780 }
781 sched_unpin();
782 }
783
784 void
785 pmap_invalidate_all(pmap_t pmap)
786 {
787 u_int cpumask;
788 u_int other_cpus;
789
790 sched_pin();
791 if (pmap == kernel_pmap || pmap->pm_active == all_cpus) {
792 invltlb();
793 smp_invltlb();
794 } else {
795 cpumask = PCPU_GET(cpumask);
796 other_cpus = PCPU_GET(other_cpus);
797 if (pmap->pm_active & cpumask)
798 invltlb();
799 if (pmap->pm_active & other_cpus)
800 smp_masked_invltlb(pmap->pm_active & other_cpus);
801 }
802 sched_unpin();
803 }
804
805 void
806 pmap_invalidate_cache(void)
807 {
808
809 sched_pin();
810 wbinvd();
811 smp_cache_flush();
812 sched_unpin();
813 }
814 #else /* !SMP */
815 /*
816 * Normal, non-SMP, 486+ invalidation functions.
817 * We inline these within pmap.c for speed.
818 */
819 PMAP_INLINE void
820 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
821 {
822
823 if (pmap == kernel_pmap || pmap->pm_active)
824 invlpg(va);
825 }
826
827 PMAP_INLINE void
828 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
829 {
830 vm_offset_t addr;
831
832 if (pmap == kernel_pmap || pmap->pm_active)
833 for (addr = sva; addr < eva; addr += PAGE_SIZE)
834 invlpg(addr);
835 }
836
837 PMAP_INLINE void
838 pmap_invalidate_all(pmap_t pmap)
839 {
840
841 if (pmap == kernel_pmap || pmap->pm_active)
842 invltlb();
843 }
844
845 PMAP_INLINE void
846 pmap_invalidate_cache(void)
847 {
848
849 wbinvd();
850 }
851 #endif /* !SMP */
852
853 /*
854 * Are we current address space or kernel? N.B. We return FALSE when
855 * a pmap's page table is in use because a kernel thread is borrowing
856 * it. The borrowed page table can change spontaneously, making any
857 * dependence on its continued use subject to a race condition.
858 */
859 static __inline int
860 pmap_is_current(pmap_t pmap)
861 {
862
863 return (pmap == kernel_pmap ||
864 (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) &&
865 (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)));
866 }
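/*
 * The comparison works because pm_pdir[PTDPTDI] is the pmap's
 * self-referential page-directory entry, while PTDpde reflects the
 * page directory actually loaded in %cr3; they match exactly when
 * this pmap's top level is the one currently in use.
 */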
867
868 /*
869 * If the given pmap is not the current or kernel pmap, the returned pte must
870 * be released by passing it to pmap_pte_release().
871 */
872 pt_entry_t *
873 pmap_pte(pmap_t pmap, vm_offset_t va)
874 {
875 pd_entry_t newpf;
876 pd_entry_t *pde;
877
878 pde = pmap_pde(pmap, va);
879 if (*pde & PG_PS)
880 return (pde);
881 if (*pde != 0) {
882 /* are we current address space or kernel? */
883 if (pmap_is_current(pmap))
884 return (vtopte(va));
885 mtx_lock(&PMAP2mutex);
886 newpf = *pde & PG_FRAME;
887 if ((*PMAP2 & PG_FRAME) != newpf) {
888 *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
889 pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
890 }
891 return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
892 }
893 return (0);
894 }
895
896 /*
897 * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte
898 * being NULL.
899 */
900 static __inline void
901 pmap_pte_release(pt_entry_t *pte)
902 {
903
904 if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2)
905 mtx_unlock(&PMAP2mutex);
906 }
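/*
 * Sketch of typical use of the PMAP2 window:
 *	pte = pmap_pte(other_pmap, va);
 *	if (pte != NULL) {
 *		val = *pte;
 *		pmap_pte_release(pte);
 *	}
 * pmap_pte_release() drops PMAP2mutex only when pte points into
 * PADDR2; for the current or kernel pmap, pmap_pte() returned a
 * vtopte() pointer and the release is a no-op.
 */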
907
908 static __inline void
909 invlcaddr(void *caddr)
910 {
911
912 invlpg((u_int)caddr);
913 }
914
915 /*
916 * Super fast pmap_pte routine best used when scanning
917 * the pv lists. This eliminates many coarse-grained
918 * invltlb calls. Note that many of the pv list
919 * scans are across different pmaps. It is very wasteful
920 * to do an entire invltlb for checking a single mapping.
921 *
922 * If the given pmap is not the current pmap, vm_page_queue_mtx
923 * must be held and curthread pinned to a CPU.
924 */
925 static pt_entry_t *
926 pmap_pte_quick(pmap_t pmap, vm_offset_t va)
927 {
928 pd_entry_t newpf;
929 pd_entry_t *pde;
930
931 pde = pmap_pde(pmap, va);
932 if (*pde & PG_PS)
933 return (pde);
934 if (*pde != 0) {
935 /* are we current address space or kernel? */
936 if (pmap_is_current(pmap))
937 return (vtopte(va));
938 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
939 KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
940 newpf = *pde & PG_FRAME;
941 if ((*PMAP1 & PG_FRAME) != newpf) {
942 *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
943 #ifdef SMP
944 PMAP1cpu = PCPU_GET(cpuid);
945 #endif
946 invlcaddr(PADDR1);
947 PMAP1changed++;
948 } else
949 #ifdef SMP
950 if (PMAP1cpu != PCPU_GET(cpuid)) {
951 PMAP1cpu = PCPU_GET(cpuid);
952 invlcaddr(PADDR1);
953 PMAP1changedcpu++;
954 } else
955 #endif
956 PMAP1unchanged++;
957 return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
958 }
959 return (0);
960 }
961
962 /*
963 * Routine: pmap_extract
964 * Function:
965 * Extract the physical page address associated
966 * with the given map/virtual_address pair.
967 */
968 vm_paddr_t
969 pmap_extract(pmap_t pmap, vm_offset_t va)
970 {
971 vm_paddr_t rtval;
972 pt_entry_t *pte;
973 pd_entry_t pde;
974
975 rtval = 0;
976 PMAP_LOCK(pmap);
977 pde = pmap->pm_pdir[va >> PDRSHIFT];
978 if (pde != 0) {
979 if ((pde & PG_PS) != 0) {
980 rtval = (pde & PG_PS_FRAME) | (va & PDRMASK);
981 PMAP_UNLOCK(pmap);
982 return rtval;
983 }
984 pte = pmap_pte(pmap, va);
985 rtval = (*pte & PG_FRAME) | (va & PAGE_MASK);
986 pmap_pte_release(pte);
987 }
988 PMAP_UNLOCK(pmap);
989 return (rtval);
990 }
991
992 /*
993 * Routine: pmap_extract_and_hold
994 * Function:
995 * Atomically extract and hold the physical page
996 * with the given pmap and virtual address pair
997 * if that mapping permits the given protection.
998 */
999 vm_page_t
1000 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1001 {
1002 pd_entry_t pde;
1003 pt_entry_t pte;
1004 vm_page_t m;
1005
1006 m = NULL;
1007 vm_page_lock_queues();
1008 PMAP_LOCK(pmap);
1009 pde = *pmap_pde(pmap, va);
1010 if (pde != 0) {
1011 if (pde & PG_PS) {
1012 if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
1013 m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
1014 (va & PDRMASK));
1015 vm_page_hold(m);
1016 }
1017 } else {
1018 sched_pin();
1019 pte = *pmap_pte_quick(pmap, va);
1020 if (pte != 0 &&
1021 ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
1022 m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
1023 vm_page_hold(m);
1024 }
1025 sched_unpin();
1026 }
1027 }
1028 vm_page_unlock_queues();
1029 PMAP_UNLOCK(pmap);
1030 return (m);
1031 }
1032
1033 /***************************************************
1034 * Low level mapping routines.....
1035 ***************************************************/
1036
1037 /*
1038 * Add a wired page to the kva.
1039 * Note: not SMP coherent.
1040 */
1041 PMAP_INLINE void
1042 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
1043 {
1044 pt_entry_t *pte;
1045
1046 pte = vtopte(va);
1047 pte_store(pte, pa | PG_RW | PG_V | pgeflag);
1048 }
1049
1050 PMAP_INLINE void
1051 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
1052 {
1053 pt_entry_t *pte;
1054
1055 pte = vtopte(va);
1056 pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
1057 }
1058
1059 /*
1060 * Remove a page from the kernel pagetables.
1061 * Note: not SMP coherent.
1062 */
1063 PMAP_INLINE void
1064 pmap_kremove(vm_offset_t va)
1065 {
1066 pt_entry_t *pte;
1067
1068 pte = vtopte(va);
1069 pte_clear(pte);
1070 }
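/*
 * Because pmap_kenter() and pmap_kremove() are not SMP coherent,
 * callers must invalidate the TLB themselves once a batch of updates
 * is complete, as pmap_map() and pmap_qremove() below do with
 * pmap_invalidate_range().
 */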
1071
1072 /*
1073 * Used to map a range of physical addresses into kernel
1074 * virtual address space.
1075 *
1076 * The value passed in '*virt' is a suggested virtual address for
1077 * the mapping. Architectures which can support a direct-mapped
1078 * physical to virtual region can return the appropriate address
1079 * within that region, leaving '*virt' unchanged. Other
1080 * architectures should map the pages starting at '*virt' and
1081 * update '*virt' with the first usable address after the mapped
1082 * region.
1083 */
1084 vm_offset_t
1085 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1086 {
1087 vm_offset_t va, sva;
1088
1089 va = sva = *virt;
1090 while (start < end) {
1091 pmap_kenter(va, start);
1092 va += PAGE_SIZE;
1093 start += PAGE_SIZE;
1094 }
1095 pmap_invalidate_range(kernel_pmap, sva, va);
1096 *virt = va;
1097 return (sva);
1098 }
1099
1100
1101 /*
1102  * Add a list of wired pages to the kva.
1103  * This routine is only used for temporary
1104 * kernel mappings that do not need to have
1105 * page modification or references recorded.
1106 * Note that old mappings are simply written
1107 * over. The page *must* be wired.
1108 * Note: SMP coherent. Uses a ranged shootdown IPI.
1109 */
1110 void
1111 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1112 {
1113 pt_entry_t *endpte, oldpte, *pte;
1114
1115 oldpte = 0;
1116 pte = vtopte(sva);
1117 endpte = pte + count;
1118 while (pte < endpte) {
1119 oldpte |= *pte;
1120 pte_store(pte, VM_PAGE_TO_PHYS(*ma) | pgeflag | PG_RW | PG_V);
1121 pte++;
1122 ma++;
1123 }
1124 if ((oldpte & PG_V) != 0)
1125 pmap_invalidate_range(kernel_pmap, sva, sva + count *
1126 PAGE_SIZE);
1127 }
1128
1129 /*
1130 * This routine tears out page mappings from the
1131 * kernel -- it is meant only for temporary mappings.
1132 * Note: SMP coherent. Uses a ranged shootdown IPI.
1133 */
1134 void
1135 pmap_qremove(vm_offset_t sva, int count)
1136 {
1137 vm_offset_t va;
1138
1139 va = sva;
1140 while (count-- > 0) {
1141 pmap_kremove(va);
1142 va += PAGE_SIZE;
1143 }
1144 pmap_invalidate_range(kernel_pmap, sva, va);
1145 }
1146
1147 /***************************************************
1148 * Page table page management routines.....
1149 ***************************************************/
1150 static __inline void
1151 pmap_free_zero_pages(vm_page_t free)
1152 {
1153 vm_page_t m;
1154
1155 while (free != NULL) {
1156 m = free;
1157 free = m->right;
1158 vm_page_free_zero(m);
1159 }
1160 }
1161
1162 /*
1163 * This routine unholds page table pages, and if the hold count
1164 * drops to zero, then it decrements the wire count.
1165 */
1166 static __inline int
1167 pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free)
1168 {
1169
1170 --m->wire_count;
1171 if (m->wire_count == 0)
1172 return _pmap_unwire_pte_hold(pmap, m, free);
1173 else
1174 return 0;
1175 }
1176
1177 static int
1178 _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free)
1179 {
1180 vm_offset_t pteva;
1181
1182 /*
1183 * unmap the page table page
1184 */
1185 pmap->pm_pdir[m->pindex] = 0;
1186 --pmap->pm_stats.resident_count;
1187
1188 /*
1189 * This is a release store so that the ordinary store unmapping
1190 * the page table page is globally performed before TLB shoot-
1191 * down is begun.
1192 */
1193 atomic_subtract_rel_int(&cnt.v_wire_count, 1);
1194
1195 /*
1196 * Do an invltlb to make the invalidated mapping
1197 * take effect immediately.
1198 */
1199 pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
1200 pmap_invalidate_page(pmap, pteva);
1201
1202 /*
1203 * Put page on a list so that it is released after
1204 * *ALL* TLB shootdown is done
1205 */
1206 m->right = *free;
1207 *free = m;
1208
1209 return 1;
1210 }
1211
1212 /*
1213 * After removing a page table entry, this routine is used to
1214 * conditionally free the page, and manage the hold/wire counts.
1215 */
1216 static int
1217 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free)
1218 {
1219 pd_entry_t ptepde;
1220 vm_page_t mpte;
1221
1222 if (va >= VM_MAXUSER_ADDRESS)
1223 return 0;
1224 ptepde = *pmap_pde(pmap, va);
1225 mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
1226 return pmap_unwire_pte_hold(pmap, mpte, free);
1227 }
1228
1229 void
1230 pmap_pinit0(pmap_t pmap)
1231 {
1232
1233 PMAP_LOCK_INIT(pmap);
1234 pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
1235 #ifdef PAE
1236 pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
1237 #endif
1238 pmap->pm_active = 0;
1239 PCPU_SET(curpmap, pmap);
1240 TAILQ_INIT(&pmap->pm_pvchunk);
1241 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1242 mtx_lock_spin(&allpmaps_lock);
1243 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1244 mtx_unlock_spin(&allpmaps_lock);
1245 }
1246
1247 /*
1248 * Initialize a preallocated and zeroed pmap structure,
1249 * such as one in a vmspace structure.
1250 */
1251 int
1252 pmap_pinit(pmap_t pmap)
1253 {
1254 vm_page_t m, ptdpg[NPGPTD];
1255 vm_paddr_t pa;
1256 static int color;
1257 int i;
1258
1259 PMAP_LOCK_INIT(pmap);
1260
1261 /*
1262 * No need to allocate page table space yet but we do need a valid
1263 * page directory table.
1264 */
1265 if (pmap->pm_pdir == NULL) {
1266 pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
1267 NBPTD);
1268
1269 if (pmap->pm_pdir == NULL) {
1270 PMAP_LOCK_DESTROY(pmap);
1271 return (0);
1272 }
1273 #ifdef PAE
1274 pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
1275 KASSERT(((vm_offset_t)pmap->pm_pdpt &
1276 ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
1277 ("pmap_pinit: pdpt misaligned"));
1278 KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
1279 ("pmap_pinit: pdpt above 4g"));
1280 #endif
1281 }
1282
1283 /*
1284 * allocate the page directory page(s)
1285 */
1286 for (i = 0; i < NPGPTD;) {
1287 m = vm_page_alloc(NULL, color++,
1288 VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1289 VM_ALLOC_ZERO);
1290 if (m == NULL)
1291 VM_WAIT;
1292 else {
1293 ptdpg[i++] = m;
1294 }
1295 }
1296
1297 pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
1298
1299 for (i = 0; i < NPGPTD; i++) {
1300 if ((ptdpg[i]->flags & PG_ZERO) == 0)
1301 bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE);
1302 }
1303
1304 mtx_lock_spin(&allpmaps_lock);
1305 LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1306 mtx_unlock_spin(&allpmaps_lock);
1307 /* Wire in kernel global address entries. */
1308 /* XXX copies current process, does not fill in MPPTDI */
1309 bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
1310 #ifdef SMP
1311 pmap->pm_pdir[MPPTDI] = PTD[MPPTDI];
1312 #endif
1313
1314 /* install self-referential address mapping entry(s) */
1315 for (i = 0; i < NPGPTD; i++) {
1316 pa = VM_PAGE_TO_PHYS(ptdpg[i]);
1317 pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
1318 #ifdef PAE
1319 pmap->pm_pdpt[i] = pa | PG_V;
1320 #endif
1321 }
1322
1323 pmap->pm_active = 0;
1324 TAILQ_INIT(&pmap->pm_pvchunk);
1325 bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
1326
1327 return (1);
1328 }
1329
1330 /*
1331 * this routine is called if the page table page is not
1332 * mapped correctly.
1333 */
1334 static vm_page_t
1335 _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags)
1336 {
1337 vm_paddr_t ptepa;
1338 vm_page_t m;
1339
1340 KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1341 (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1342 ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1343
1344 /*
1345 * Allocate a page table page.
1346 */
1347 if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1348 VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1349 if (flags & M_WAITOK) {
1350 PMAP_UNLOCK(pmap);
1351 vm_page_unlock_queues();
1352 VM_WAIT;
1353 vm_page_lock_queues();
1354 PMAP_LOCK(pmap);
1355 }
1356
1357 /*
1358 * Indicate the need to retry. While waiting, the page table
1359 * page may have been allocated.
1360 */
1361 return (NULL);
1362 }
1363 if ((m->flags & PG_ZERO) == 0)
1364 pmap_zero_page(m);
1365
1366 /*
1367 * Map the pagetable page into the process address space, if
1368 * it isn't already there.
1369 */
1370
1371 pmap->pm_stats.resident_count++;
1372
1373 ptepa = VM_PAGE_TO_PHYS(m);
1374 pmap->pm_pdir[ptepindex] =
1375 (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
1376
1377 return m;
1378 }
1379
1380 static vm_page_t
1381 pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
1382 {
1383 unsigned ptepindex;
1384 pd_entry_t ptepa;
1385 vm_page_t m;
1386
1387 KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
1388 (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
1389 ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
1390
1391 /*
1392 * Calculate pagetable page index
1393 */
1394 ptepindex = va >> PDRSHIFT;
1395 retry:
1396 /*
1397 * Get the page directory entry
1398 */
1399 ptepa = pmap->pm_pdir[ptepindex];
1400
1401 /*
1402 * This supports switching from a 4MB page to a
1403 * normal 4K page.
1404 */
1405 if (ptepa & PG_PS) {
1406 pmap->pm_pdir[ptepindex] = 0;
1407 ptepa = 0;
1408 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
1409 pmap_invalidate_all(kernel_pmap);
1410 }
1411
1412 /*
1413 * If the page table page is mapped, we just increment the
1414 * hold count, and activate it.
1415 */
1416 if (ptepa) {
1417 m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
1418 m->wire_count++;
1419 } else {
1420 /*
1421 * Here if the pte page isn't mapped, or if it has
1422 * been deallocated.
1423 */
1424 m = _pmap_allocpte(pmap, ptepindex, flags);
1425 if (m == NULL && (flags & M_WAITOK))
1426 goto retry;
1427 }
1428 return (m);
1429 }
1430
1431
1432 /***************************************************
1433 * Pmap allocation/deallocation routines.
1434 ***************************************************/
1435
1436 #ifdef SMP
1437 /*
1438 * Deal with a SMP shootdown of other users of the pmap that we are
1439 * trying to dispose of. This can be a bit hairy.
1440 */
1441 static u_int *lazymask;
1442 static u_int lazyptd;
1443 static volatile u_int lazywait;
1444
1445 void pmap_lazyfix_action(void);
1446
1447 void
1448 pmap_lazyfix_action(void)
1449 {
1450 u_int mymask = PCPU_GET(cpumask);
1451
1452 #ifdef COUNT_IPIS
1453 (*ipi_lazypmap_counts[PCPU_GET(cpuid)])++;
1454 #endif
1455 if (rcr3() == lazyptd)
1456 load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1457 atomic_clear_int(lazymask, mymask);
1458 atomic_store_rel_int(&lazywait, 1);
1459 }
1460
1461 static void
1462 pmap_lazyfix_self(u_int mymask)
1463 {
1464
1465 if (rcr3() == lazyptd)
1466 load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1467 atomic_clear_int(lazymask, mymask);
1468 }
1469
1470
1471 static void
1472 pmap_lazyfix(pmap_t pmap)
1473 {
1474 u_int mymask;
1475 u_int mask;
1476 u_int spins;
1477
1478 while ((mask = pmap->pm_active) != 0) {
1479 spins = 50000000;
1480 mask = mask & -mask; /* Find least significant set bit */
1481 mtx_lock_spin(&smp_ipi_mtx);
1482 #ifdef PAE
1483 lazyptd = vtophys(pmap->pm_pdpt);
1484 #else
1485 lazyptd = vtophys(pmap->pm_pdir);
1486 #endif
1487 mymask = PCPU_GET(cpumask);
1488 if (mask == mymask) {
1489 lazymask = &pmap->pm_active;
1490 pmap_lazyfix_self(mymask);
1491 } else {
1492 atomic_store_rel_int((u_int *)&lazymask,
1493 (u_int)&pmap->pm_active);
1494 atomic_store_rel_int(&lazywait, 0);
1495 ipi_selected(mask, IPI_LAZYPMAP);
1496 while (lazywait == 0) {
1497 ia32_pause();
1498 if (--spins == 0)
1499 break;
1500 }
1501 }
1502 mtx_unlock_spin(&smp_ipi_mtx);
1503 if (spins == 0)
1504 printf("pmap_lazyfix: spun for 50000000\n");
1505 }
1506 }
1507
1508 #else /* SMP */
1509
1510 /*
1511 * Cleaning up on uniprocessor is easy. For various reasons, we're
1512 * unlikely to have to even execute this code, including the fact
1513 * that the cleanup is deferred until the parent does a wait(2), which
1514 * means that another userland process has run.
1515 */
1516 static void
1517 pmap_lazyfix(pmap_t pmap)
1518 {
1519 u_int cr3;
1520
1521 cr3 = vtophys(pmap->pm_pdir);
1522 if (cr3 == rcr3()) {
1523 load_cr3(PCPU_GET(curpcb)->pcb_cr3);
1524 pmap->pm_active &= ~(PCPU_GET(cpumask));
1525 }
1526 }
1527 #endif /* SMP */
1528
1529 /*
1530 * Release any resources held by the given physical map.
1531 * Called when a pmap initialized by pmap_pinit is being released.
1532 * Should only be called if the map contains no valid mappings.
1533 */
1534 void
1535 pmap_release(pmap_t pmap)
1536 {
1537 vm_page_t m, ptdpg[NPGPTD];
1538 int i;
1539
1540 KASSERT(pmap->pm_stats.resident_count == 0,
1541 ("pmap_release: pmap resident count %ld != 0",
1542 pmap->pm_stats.resident_count));
1543
1544 pmap_lazyfix(pmap);
1545 mtx_lock_spin(&allpmaps_lock);
1546 LIST_REMOVE(pmap, pm_list);
1547 mtx_unlock_spin(&allpmaps_lock);
1548
1549 for (i = 0; i < NPGPTD; i++)
1550 ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] &
1551 PG_FRAME);
1552
1553 bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
1554 sizeof(*pmap->pm_pdir));
1555 #ifdef SMP
1556 pmap->pm_pdir[MPPTDI] = 0;
1557 #endif
1558
1559 pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
1560
1561 for (i = 0; i < NPGPTD; i++) {
1562 m = ptdpg[i];
1563 #ifdef PAE
1564 KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
1565 ("pmap_release: got wrong ptd page"));
1566 #endif
1567 m->wire_count--;
1568 atomic_subtract_int(&cnt.v_wire_count, 1);
1569 vm_page_free_zero(m);
1570 }
1571 PMAP_LOCK_DESTROY(pmap);
1572 }
1573
1574 static int
1575 kvm_size(SYSCTL_HANDLER_ARGS)
1576 {
1577 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
1578
1579 return sysctl_handle_long(oidp, &ksize, 0, req);
1580 }
1581 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
1582 0, 0, kvm_size, "IU", "Size of KVM");
1583
1584 static int
1585 kvm_free(SYSCTL_HANDLER_ARGS)
1586 {
1587 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1588
1589 return sysctl_handle_long(oidp, &kfree, 0, req);
1590 }
1591 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
1592 0, 0, kvm_free, "IU", "Amount of KVM free");
1593
1594 /*
1595 * grow the number of kernel page table entries, if needed
1596 */
1597 void
1598 pmap_growkernel(vm_offset_t addr)
1599 {
1600 struct pmap *pmap;
1601 vm_paddr_t ptppaddr;
1602 vm_page_t nkpg;
1603 pd_entry_t newpdir;
1604 pt_entry_t *pde;
1605
1606 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1607 if (kernel_vm_end == 0) {
1608 kernel_vm_end = KERNBASE;
1609 nkpt = 0;
1610 while (pdir_pde(PTD, kernel_vm_end)) {
1611 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1612 nkpt++;
1613 if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1614 kernel_vm_end = kernel_map->max_offset;
1615 break;
1616 }
1617 }
1618 }
1619 addr = roundup2(addr, PAGE_SIZE * NPTEPG);
1620 if (addr - 1 >= kernel_map->max_offset)
1621 addr = kernel_map->max_offset;
1622 while (kernel_vm_end < addr) {
1623 if (pdir_pde(PTD, kernel_vm_end)) {
1624 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1625 if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1626 kernel_vm_end = kernel_map->max_offset;
1627 break;
1628 }
1629 continue;
1630 }
1631
1632 /*
1633 * This index is bogus, but out of the way
1634 */
1635 nkpg = vm_page_alloc(NULL, nkpt,
1636 VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
1637 if (!nkpg)
1638 panic("pmap_growkernel: no memory to grow kernel");
1639
1640 nkpt++;
1641
1642 pmap_zero_page(nkpg);
1643 ptppaddr = VM_PAGE_TO_PHYS(nkpg);
1644 newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
1645 pdir_pde(PTD, kernel_vm_end) = newpdir;
1646
1647 mtx_lock_spin(&allpmaps_lock);
1648 LIST_FOREACH(pmap, &allpmaps, pm_list) {
1649 pde = pmap_pde(pmap, kernel_vm_end);
1650 pde_store(pde, newpdir);
1651 }
1652 mtx_unlock_spin(&allpmaps_lock);
1653 kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
1654 if (kernel_vm_end - 1 >= kernel_map->max_offset) {
1655 kernel_vm_end = kernel_map->max_offset;
1656 break;
1657 }
1658 }
1659 }
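/*
 * The rounding above advances kernel_vm_end one page-table page at a
 * time: PAGE_SIZE * NPTEPG is 4MB on non-PAE kernels (2MB under PAE),
 * so e.g. a kernel_vm_end of 0xc1234000 steps to 0xc1400000 after one
 * new page table is wired in.
 */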
1660
1661
1662 /***************************************************
1663 * page management routines.
1664 ***************************************************/
1665
1666 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
1667 CTASSERT(_NPCM == 11);
1668
1669 static __inline struct pv_chunk *
1670 pv_to_chunk(pv_entry_t pv)
1671 {
1672
1673 return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK);
1674 }
1675
1676 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
1677
1678 #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */
1679 #define PC_FREE10 0x0000fffful /* Free values for index 10 */
1680
1681 static uint32_t pc_freemask[11] = {
1682 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
1683 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
1684 PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
1685 PC_FREE0_9, PC_FREE10
1686 };
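/*
 * Each page-sized chunk therefore tracks 10 * 32 + 16 == 336 pv
 * entries: words 0-9 of pc_map are fully used, while only the low 16
 * bits of word 10 correspond to real entries (PC_FREE10), consistent
 * with the CTASSERTs above that _NPCM == 11 and that a chunk exactly
 * fills one page.
 */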
1687
1688 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
1689 "Current number of pv entries");
1690
1691 #ifdef PV_STATS
1692 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
1693
1694 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
1695 "Current number of pv entry chunks");
1696 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
1697 "Current number of pv entry chunks allocated");
1698 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
1699 "Current number of pv entry chunks frees");
1700 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
1701 "Number of times tried to get a chunk page but failed.");
1702
1703 static long pv_entry_frees, pv_entry_allocs;
1704 static int pv_entry_spare;
1705
1706 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
1707 "Current number of pv entry frees");
1708 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
1709 "Current number of pv entry allocs");
1710 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
1711 "Current number of spare pv entries");
1712
1713 static int pmap_collect_inactive, pmap_collect_active;
1714
1715 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
1716 "Current number times pmap_collect called on inactive queue");
1717 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
1718 "Current number times pmap_collect called on active queue");
1719 #endif
1720
1721 /*
1722 * We are in a serious low memory condition. Resort to
1723 * drastic measures to free some pages so we can allocate
1724 * another pv entry chunk. This is normally called to
1725 * unmap inactive pages, and if necessary, active pages.
1726 */
1727 static void
1728 pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
1729 {
1730 pmap_t pmap;
1731 pt_entry_t *pte, tpte;
1732 pv_entry_t next_pv, pv;
1733 vm_offset_t va;
1734 vm_page_t m, free;
1735
1736 sched_pin();
1737 TAILQ_FOREACH(m, &vpq->pl, pageq) {
1738 if (m->hold_count || m->busy)
1739 continue;
1740 TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
1741 va = pv->pv_va;
1742 pmap = PV_PMAP(pv);
1743 /* Avoid deadlock and lock recursion. */
1744 if (pmap > locked_pmap)
1745 PMAP_LOCK(pmap);
1746 else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
1747 continue;
1748 pmap->pm_stats.resident_count--;
1749 pte = pmap_pte_quick(pmap, va);
1750 tpte = pte_load_clear(pte);
1751 KASSERT((tpte & PG_W) == 0,
1752 ("pmap_collect: wired pte %#jx", (uintmax_t)tpte));
1753 if (tpte & PG_A)
1754 vm_page_flag_set(m, PG_REFERENCED);
1755 if (tpte & PG_M) {
1756 KASSERT((tpte & PG_RW),
1757 ("pmap_collect: modified page not writable: va: %#x, pte: %#jx",
1758 va, (uintmax_t)tpte));
1759 vm_page_dirty(m);
1760 }
1761 free = NULL;
1762 pmap_unuse_pt(pmap, va, &free);
1763 pmap_invalidate_page(pmap, va);
1764 pmap_free_zero_pages(free);
1765 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1766 if (TAILQ_EMPTY(&m->md.pv_list))
1767 vm_page_flag_clear(m, PG_WRITEABLE);
1768 m->md.pv_list_count--;
1769 free_pv_entry(pmap, pv);
1770 if (pmap != locked_pmap)
1771 PMAP_UNLOCK(pmap);
1772 }
1773 }
1774 sched_unpin();
1775 }
1776
1777
1778 /*
1779 * free the pv_entry back to the free list
1780 */
1781 static void
1782 free_pv_entry(pmap_t pmap, pv_entry_t pv)
1783 {
1784 vm_page_t m;
1785 struct pv_chunk *pc;
1786 int idx, field, bit;
1787
1788 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1789 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1790 PV_STAT(pv_entry_frees++);
1791 PV_STAT(pv_entry_spare++);
1792 pv_entry_count--;
1793 pc = pv_to_chunk(pv);
1794 idx = pv - &pc->pc_pventry[0];
1795 field = idx / 32;
1796 bit = idx % 32;
1797 pc->pc_map[field] |= 1ul << bit;
1798 /* move to head of list */
1799 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1800 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1801 for (idx = 0; idx < _NPCM; idx++)
1802 if (pc->pc_map[idx] != pc_freemask[idx])
1803 return;
1804 PV_STAT(pv_entry_spare -= _NPCPV);
1805 PV_STAT(pc_chunk_count--);
1806 PV_STAT(pc_chunk_frees++);
1807 /* entire chunk is free, return it */
1808 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1809 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
1810 pmap_qremove((vm_offset_t)pc, 1);
1811 vm_page_unwire(m, 0);
1812 vm_page_free(m);
1813 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
1814 }
1815
1816 /*
1817 * get a new pv_entry, allocating a block from the system
1818 * when needed.
1819 */
1820 static pv_entry_t
1821 get_pv_entry(pmap_t pmap, int try)
1822 {
1823 static const struct timeval printinterval = { 60, 0 };
1824 static struct timeval lastprint;
1825 static vm_pindex_t colour;
1826 int bit, field;
1827 pv_entry_t pv;
1828 struct pv_chunk *pc;
1829 vm_page_t m;
1830
1831 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1832 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1833 PV_STAT(pv_entry_allocs++);
1834 pv_entry_count++;
1835 if (pv_entry_count > pv_entry_high_water)
1836 if (ratecheck(&lastprint, &printinterval))
1837 printf("Approaching the limit on PV entries, consider "
1838 "increasing either the vm.pmap.shpgperproc or the "
1839 "vm.pmap.pv_entry_max tunable.\n");
1840 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
1841 if (pc != NULL) {
1842 for (field = 0; field < _NPCM; field++) {
1843 if (pc->pc_map[field]) {
1844 bit = bsfl(pc->pc_map[field]);
1845 break;
1846 }
1847 }
1848 if (field < _NPCM) {
1849 pv = &pc->pc_pventry[field * 32 + bit];
1850 pc->pc_map[field] &= ~(1ul << bit);
1851 /* If this was the last item, move it to tail */
1852 for (field = 0; field < _NPCM; field++)
1853 if (pc->pc_map[field] != 0) {
1854 PV_STAT(pv_entry_spare--);
1855 return (pv); /* not full, return */
1856 }
1857 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1858 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
1859 PV_STAT(pv_entry_spare--);
1860 return (pv);
1861 }
1862 }
1863 pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
1864 m = vm_page_alloc(NULL, colour, VM_ALLOC_NORMAL |
1865 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
1866 if (m == NULL || pc == NULL) {
1867 if (try) {
1868 pv_entry_count--;
1869 PV_STAT(pc_chunk_tryfail++);
1870 if (m) {
1871 vm_page_lock_queues();
1872 vm_page_unwire(m, 0);
1873 vm_page_free(m);
1874 vm_page_unlock_queues();
1875 }
1876 if (pc)
1877 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
1878 return (NULL);
1879 }
1880 /*
1881 * Reclaim pv entries: At first, destroy mappings to
1882 * inactive pages. After that, if a pv chunk entry
1883 * is still needed, destroy mappings to active pages.
1884 */
1885 PV_STAT(pmap_collect_inactive++);
1886 pmap_collect(pmap, &vm_page_queues[PQ_INACTIVE]);
1887 if (m == NULL)
1888 m = vm_page_alloc(NULL, colour, VM_ALLOC_NORMAL |
1889 VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
1890 if (pc == NULL)
1891 pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
1892 if (m == NULL || pc == NULL) {
1893 PV_STAT(pmap_collect_active++);
1894 pmap_collect(pmap, &vm_page_queues[PQ_ACTIVE]);
1895 if (m == NULL)
1896 m = vm_page_alloc(NULL, colour,
1897 VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ |
1898 VM_ALLOC_WIRED);
1899 if (pc == NULL)
1900 pc = (struct pv_chunk *)
1901 pmap_ptelist_alloc(&pv_vafree);
1902 if (m == NULL || pc == NULL)
1903 panic("get_pv_entry: increase vm.pmap.shpgperproc");
1904 }
1905 }
1906 PV_STAT(pc_chunk_count++);
1907 PV_STAT(pc_chunk_allocs++);
1908 colour++;
1909 pmap_qenter((vm_offset_t)pc, &m, 1);
1910 pc->pc_pmap = pmap;
1911 pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */
1912 for (field = 1; field < _NPCM; field++)
1913 pc->pc_map[field] = pc_freemask[field];
1914 pv = &pc->pc_pventry[0];
1915 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1916 PV_STAT(pv_entry_spare += _NPCPV - 1);
1917 return (pv);
1918 }
1919
1920 static void
1921 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
1922 {
1923 pv_entry_t pv;
1924
1925 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1926 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1927 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
1928 if (pmap == PV_PMAP(pv) && va == pv->pv_va)
1929 break;
1930 }
1931 KASSERT(pv != NULL, ("pmap_remove_entry: pv not found"));
1932 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
1933 m->md.pv_list_count--;
1934 if (TAILQ_EMPTY(&m->md.pv_list))
1935 vm_page_flag_clear(m, PG_WRITEABLE);
1936 free_pv_entry(pmap, pv);
1937 }
1938
1939 /*
1940 * Create a pv entry for page at pa for
1941 * (pmap, va).
1942 */
1943 static void
1944 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
1945 {
1946 pv_entry_t pv;
1947
1948 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1949 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1950 pv = get_pv_entry(pmap, FALSE);
1951 pv->pv_va = va;
1952 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1953 m->md.pv_list_count++;
1954 }
1955
1956 /*
1957 * Conditionally create a pv entry.
1958 */
1959 static boolean_t
1960 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
1961 {
1962 pv_entry_t pv;
1963
1964 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1965 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1966 if (pv_entry_count < pv_entry_high_water &&
1967 (pv = get_pv_entry(pmap, TRUE)) != NULL) {
1968 pv->pv_va = va;
1969 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
1970 m->md.pv_list_count++;
1971 return (TRUE);
1972 } else
1973 return (FALSE);
1974 }
1975
1976 /*
1977 * pmap_remove_pte: do the things to unmap a page in a process
1978 */
1979 static int
1980 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free)
1981 {
1982 pt_entry_t oldpte;
1983 vm_page_t m;
1984
1985 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
1986 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1987 oldpte = pte_load_clear(ptq);
1988 if (oldpte & PG_W)
1989 pmap->pm_stats.wired_count -= 1;
1990 /*
1991 * Machines that don't support invlpg, also don't support
1992 * PG_G.
1993 */
1994 if (oldpte & PG_G)
1995 pmap_invalidate_page(kernel_pmap, va);
1996 pmap->pm_stats.resident_count -= 1;
1997 if (oldpte & PG_MANAGED) {
1998 m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
1999 if (oldpte & PG_M) {
2000 KASSERT((oldpte & PG_RW),
2001 ("pmap_remove_pte: modified page not writable: va: %#x, pte: %#jx",
2002 va, (uintmax_t)oldpte));
2003 vm_page_dirty(m);
2004 }
2005 if (oldpte & PG_A)
2006 vm_page_flag_set(m, PG_REFERENCED);
2007 pmap_remove_entry(pmap, m, va);
2008 }
2009 return (pmap_unuse_pt(pmap, va, free));
2010 }
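
/*
 * Editorial note (not part of the original source): PG_G (global)
 * TLB entries survive a %cr3 reload, so a PG_G mapping must be
 * flushed explicitly with invlpg; that is why the code above
 * invalidates it immediately, and via kernel_pmap, since PG_G is
 * applied only to kernel mappings.  Non-global user ptes can instead
 * be batched by pmap_remove() and flushed once with
 * pmap_invalidate_all().
 */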
2011
2012 /*
2013 * Remove a single page from a process address space
2014 */
2015 static void
2016 pmap_remove_page(pmap_t pmap, vm_offset_t va, vm_page_t *free)
2017 {
2018 pt_entry_t *pte;
2019
2020 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2021 KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
2022 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2023 if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0)
2024 return;
2025 pmap_remove_pte(pmap, pte, va, free);
2026 pmap_invalidate_page(pmap, va);
2027 }
2028
2029 /*
2030 * Remove the given range of addresses from the specified map.
2031 *
2032 * It is assumed that the start and end are properly
2033 * rounded to the page size.
2034 */
2035 void
2036 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2037 {
2038 vm_offset_t pdnxt;
2039 pd_entry_t ptpaddr;
2040 pt_entry_t *pte;
2041 vm_page_t free = NULL;
2042 int anyvalid;
2043
2044 /*
2045 * Perform an unsynchronized read. This is, however, safe.
2046 */
2047 if (pmap->pm_stats.resident_count == 0)
2048 return;
2049
2050 anyvalid = 0;
2051
2052 vm_page_lock_queues();
2053 sched_pin();
2054 PMAP_LOCK(pmap);
2055
2056 /*
2057	 * Special handling for removing a single page: this is a
2058	 * very common operation, so it is worth short-circuiting
2059	 * the general loop below.
2060 */
2061 if ((sva + PAGE_SIZE == eva) &&
2062 ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
2063 pmap_remove_page(pmap, sva, &free);
2064 goto out;
2065 }
2066
2067 for (; sva < eva; sva = pdnxt) {
2068 unsigned pdirindex;
2069
2070 /*
2071 * Calculate index for next page table.
2072 */
2073 pdnxt = (sva + NBPDR) & ~PDRMASK;
2074 if (pmap->pm_stats.resident_count == 0)
2075 break;
2076
2077 pdirindex = sva >> PDRSHIFT;
2078 ptpaddr = pmap->pm_pdir[pdirindex];
2079
2080 /*
2081 * Weed out invalid mappings. Note: we assume that the page
2082 * directory table is always allocated, and in kernel virtual.
2083 */
2084 if (ptpaddr == 0)
2085 continue;
2086
2087 /*
2088 * Check for large page.
2089 */
2090 if ((ptpaddr & PG_PS) != 0) {
2091 pmap->pm_pdir[pdirindex] = 0;
2092 pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
2093 anyvalid = 1;
2094 continue;
2095 }
2096
2097 /*
2098 * Limit our scan to either the end of the va represented
2099 * by the current page table page, or to the end of the
2100 * range being removed.
2101 */
2102 if (pdnxt > eva)
2103 pdnxt = eva;
2104
2105 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
2106 sva += PAGE_SIZE) {
2107 if (*pte == 0)
2108 continue;
2109
2110 /*
2111 * The TLB entry for a PG_G mapping is invalidated
2112 * by pmap_remove_pte().
2113 */
2114 if ((*pte & PG_G) == 0)
2115 anyvalid = 1;
2116 if (pmap_remove_pte(pmap, pte, sva, &free))
2117 break;
2118 }
2119 }
2120 out:
2121 sched_unpin();
2122 if (anyvalid)
2123 pmap_invalidate_all(pmap);
2124 vm_page_unlock_queues();
2125 PMAP_UNLOCK(pmap);
2126 pmap_free_zero_pages(free);
2127 }
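
/*
 * Editorial sketch (not part of the original source): the walk above
 * advances one page-directory entry at a time.  The boundary
 * computation
 *
 *	pdnxt = (sva + NBPDR) & ~PDRMASK;
 *
 * rounds up to the next 4MB boundary; e.g., with NBPDR = 0x400000
 * and sva = 0x00401000, pdnxt = 0x00800000.  The inner pte loop
 * therefore never crosses a page table page, so pmap_pte_quick()
 * need only be called once per directory entry.
 */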
2128
2129 /*
2130 * Routine: pmap_remove_all
2131 * Function:
2132 * Removes this physical page from
2133 * all physical maps in which it resides.
2134 * Reflects back modify bits to the pager.
2135 *
2136 * Notes:
2137 * Original versions of this routine were very
2138 * inefficient because they iteratively called
2139 * pmap_remove (slow...)
2140 */
2141
2142 void
2143 pmap_remove_all(vm_page_t m)
2144 {
2145 pv_entry_t pv;
2146 pmap_t pmap;
2147 pt_entry_t *pte, tpte;
2148 vm_page_t free;
2149
2150 #if defined(PMAP_DIAGNOSTIC)
2151 /*
2152 * XXX This makes pmap_remove_all() illegal for non-managed pages!
2153 */
2154 if (m->flags & PG_FICTITIOUS) {
2155		panic("pmap_remove_all: illegal for unmanaged page, pa: 0x%x",
2156 VM_PAGE_TO_PHYS(m));
2157 }
2158 #endif
2159 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2160 sched_pin();
2161 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2162 pmap = PV_PMAP(pv);
2163 PMAP_LOCK(pmap);
2164 pmap->pm_stats.resident_count--;
2165 pte = pmap_pte_quick(pmap, pv->pv_va);
2166 tpte = pte_load_clear(pte);
2167 if (tpte & PG_W)
2168 pmap->pm_stats.wired_count--;
2169 if (tpte & PG_A)
2170 vm_page_flag_set(m, PG_REFERENCED);
2171
2172 /*
2173 * Update the vm_page_t clean and reference bits.
2174 */
2175 if (tpte & PG_M) {
2176 KASSERT((tpte & PG_RW),
2177 ("pmap_remove_all: modified page not writable: va: %#x, pte: %#jx",
2178 pv->pv_va, (uintmax_t)tpte));
2179 vm_page_dirty(m);
2180 }
2181 free = NULL;
2182 pmap_unuse_pt(pmap, pv->pv_va, &free);
2183 pmap_invalidate_page(pmap, pv->pv_va);
2184 pmap_free_zero_pages(free);
2185 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
2186 m->md.pv_list_count--;
2187 free_pv_entry(pmap, pv);
2188 PMAP_UNLOCK(pmap);
2189 }
2190 vm_page_flag_clear(m, PG_WRITEABLE);
2191 sched_unpin();
2192 }
2193
2194 /*
2195 * Set the physical protection on the
2196 * specified range of this map as requested.
2197 */
2198 void
2199 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2200 {
2201 vm_offset_t pdnxt;
2202 pd_entry_t ptpaddr;
2203 pt_entry_t *pte;
2204 int anychanged;
2205
2206 if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2207 pmap_remove(pmap, sva, eva);
2208 return;
2209 }
2210
2211 #ifdef PAE
2212 if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
2213 (VM_PROT_WRITE|VM_PROT_EXECUTE))
2214 return;
2215 #else
2216 if (prot & VM_PROT_WRITE)
2217 return;
2218 #endif
2219
2220 anychanged = 0;
2221
2222 vm_page_lock_queues();
2223 sched_pin();
2224 PMAP_LOCK(pmap);
2225 for (; sva < eva; sva = pdnxt) {
2226 pt_entry_t obits, pbits;
2227 unsigned pdirindex;
2228
2229 pdnxt = (sva + NBPDR) & ~PDRMASK;
2230
2231 pdirindex = sva >> PDRSHIFT;
2232 ptpaddr = pmap->pm_pdir[pdirindex];
2233
2234 /*
2235 * Weed out invalid mappings. Note: we assume that the page
2236 * directory table is always allocated, and in kernel virtual.
2237 */
2238 if (ptpaddr == 0)
2239 continue;
2240
2241 /*
2242 * Check for large page.
2243 */
2244 if ((ptpaddr & PG_PS) != 0) {
2245 if ((prot & VM_PROT_WRITE) == 0)
2246 pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
2247 #ifdef PAE
2248 if ((prot & VM_PROT_EXECUTE) == 0)
2249 pmap->pm_pdir[pdirindex] |= pg_nx;
2250 #endif
2251 anychanged = 1;
2252 continue;
2253 }
2254
2255 if (pdnxt > eva)
2256 pdnxt = eva;
2257
2258 for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
2259 sva += PAGE_SIZE) {
2260 vm_page_t m;
2261
2262 retry:
2263 /*
2264 * Regardless of whether a pte is 32 or 64 bits in
2265 * size, PG_RW, PG_A, and PG_M are among the least
2266 * significant 32 bits.
2267 */
2268 obits = pbits = *pte;
2269 if ((pbits & PG_V) == 0)
2270 continue;
2271 if (pbits & PG_MANAGED) {
2272 m = NULL;
2273 if (pbits & PG_A) {
2274 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
2275 vm_page_flag_set(m, PG_REFERENCED);
2276 pbits &= ~PG_A;
2277 }
2278 if ((pbits & PG_M) != 0) {
2279 if (m == NULL)
2280 m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
2281 vm_page_dirty(m);
2282 }
2283 }
2284
2285 if ((prot & VM_PROT_WRITE) == 0)
2286 pbits &= ~(PG_RW | PG_M);
2287 #ifdef PAE
2288 if ((prot & VM_PROT_EXECUTE) == 0)
2289 pbits |= pg_nx;
2290 #endif
2291
2292 if (pbits != obits) {
2293 #ifdef PAE
2294 if (!atomic_cmpset_64(pte, obits, pbits))
2295 goto retry;
2296 #else
2297 if (!atomic_cmpset_int((u_int *)pte, obits,
2298 pbits))
2299 goto retry;
2300 #endif
2301 if (obits & PG_G)
2302 pmap_invalidate_page(pmap, sva);
2303 else
2304 anychanged = 1;
2305 }
2306 }
2307 }
2308 sched_unpin();
2309 if (anychanged)
2310 pmap_invalidate_all(pmap);
2311 vm_page_unlock_queues();
2312 PMAP_UNLOCK(pmap);
2313 }
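
/*
 * Editorial sketch (not part of the original source): the retry loop
 * above is a standard lock-free read-modify-write.  In C11 atomics,
 * and ignoring the 64-bit PAE case, it would look roughly like:
 *
 *	#include <stdatomic.h>
 *	#include <stdint.h>
 *
 *	static void
 *	pte_clear_bits(_Atomic uint32_t *pte, uint32_t bits)
 *	{
 *		uint32_t obits, pbits;
 *
 *		obits = atomic_load(pte);
 *		do {
 *			pbits = obits & ~bits;
 *		} while (!atomic_compare_exchange_weak(pte, &obits,
 *		    pbits));
 *	}
 *
 * If another CPU, or the MMU setting PG_A/PG_M, changes the pte
 * between the load and the swap, the exchange fails, obits is
 * refreshed, and the loop retries, which is exactly what the
 * goto retry above accomplishes.
 */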
2314
2315 /*
2316 * Insert the given physical page (p) at
2317 * the specified virtual address (v) in the
2318 * target physical map with the protection requested.
2319 *
2320 * If specified, the page will be wired down, meaning
2321 * that the related pte can not be reclaimed.
2322 *
2323 * NB: This is the only routine which MAY NOT lazy-evaluate
2324 * or lose information. That is, this routine must actually
2325 * insert this page into the given map NOW.
2326 */
2327 void
2328 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
2329 boolean_t wired)
2330 {
2331 vm_paddr_t pa;
2332 pd_entry_t *pde;
2333 pt_entry_t *pte;
2334 vm_paddr_t opa;
2335 pt_entry_t origpte, newpte;
2336 vm_page_t mpte, om;
2337 boolean_t invlva;
2338
2339 va = trunc_page(va);
2340 #ifdef PMAP_DIAGNOSTIC
2341 if (va > VM_MAX_KERNEL_ADDRESS)
2342 panic("pmap_enter: toobig");
2343 if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
2344 panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va);
2345 #endif
2346
2347 mpte = NULL;
2348
2349 vm_page_lock_queues();
2350 PMAP_LOCK(pmap);
2351 sched_pin();
2352
2353 /*
2354 * In the case that a page table page is not
2355 * resident, we are creating it here.
2356 */
2357 if (va < VM_MAXUSER_ADDRESS) {
2358 mpte = pmap_allocpte(pmap, va, M_WAITOK);
2359 }
2360 #if 0 && defined(PMAP_DIAGNOSTIC)
2361 else {
2362 pd_entry_t *pdeaddr = pmap_pde(pmap, va);
2363 origpte = *pdeaddr;
2364 if ((origpte & PG_V) == 0) {
2365 panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n",
2366 pmap->pm_pdir[PTDPTDI], origpte, va);
2367 }
2368 }
2369 #endif
2370
2371 pde = pmap_pde(pmap, va);
2372 if ((*pde & PG_PS) != 0)
2373 panic("pmap_enter: attempted pmap_enter on 4MB page");
2374 pte = pmap_pte_quick(pmap, va);
2375
2376 /*
2377	 * A null pte here means the page directory entry is invalid: no PT page backs this address.
2378 */
2379 if (pte == NULL) {
2380 panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x\n",
2381 (uintmax_t)pmap->pm_pdir[PTDPTDI], va);
2382 }
2383
2384 pa = VM_PAGE_TO_PHYS(m);
2385 om = NULL;
2386 origpte = *pte;
2387 opa = origpte & PG_FRAME;
2388
2389 /*
2390 * Mapping has not changed, must be protection or wiring change.
2391 */
2392 if (origpte && (opa == pa)) {
2393 /*
2394 * Wiring change, just update stats. We don't worry about
2395 * wiring PT pages as they remain resident as long as there
2396 * are valid mappings in them. Hence, if a user page is wired,
2397 * the PT page will be also.
2398 */
2399 if (wired && ((origpte & PG_W) == 0))
2400 pmap->pm_stats.wired_count++;
2401 else if (!wired && (origpte & PG_W))
2402 pmap->pm_stats.wired_count--;
2403
2404 /*
2405 * Remove extra pte reference
2406 */
2407 if (mpte)
2408 mpte->wire_count--;
2409
2410 /*
2411 * We might be turning off write access to the page,
2412 * so we go ahead and sense modify status.
2413 */
2414 if (origpte & PG_MANAGED) {
2415 om = m;
2416 pa |= PG_MANAGED;
2417 }
2418 goto validate;
2419 }
2420 /*
2421 * Mapping has changed, invalidate old range and fall through to
2422 * handle validating new mapping.
2423 */
2424 if (opa) {
2425 if (origpte & PG_W)
2426 pmap->pm_stats.wired_count--;
2427 if (origpte & PG_MANAGED) {
2428 om = PHYS_TO_VM_PAGE(opa);
2429 pmap_remove_entry(pmap, om, va);
2430 }
2431 if (mpte != NULL) {
2432 mpte->wire_count--;
2433 KASSERT(mpte->wire_count > 0,
2434 ("pmap_enter: missing reference to page table page,"
2435 " va: 0x%x", va));
2436 }
2437 } else
2438 pmap->pm_stats.resident_count++;
2439
2440 /*
2441 * Enter on the PV list if part of our managed memory.
2442 */
2443 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
2444 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
2445 ("pmap_enter: managed mapping within the clean submap"));
2446 pmap_insert_entry(pmap, va, m);
2447 pa |= PG_MANAGED;
2448 }
2449
2450 /*
2451 * Increment counters
2452 */
2453 if (wired)
2454 pmap->pm_stats.wired_count++;
2455
2456 validate:
2457 /*
2458 * Now validate mapping with desired protection/wiring.
2459 */
2460 newpte = (pt_entry_t)(pa | PG_V);
2461 if ((prot & VM_PROT_WRITE) != 0) {
2462 newpte |= PG_RW;
2463 vm_page_flag_set(m, PG_WRITEABLE);
2464 }
2465 #ifdef PAE
2466 if ((prot & VM_PROT_EXECUTE) == 0)
2467 newpte |= pg_nx;
2468 #endif
2469 if (wired)
2470 newpte |= PG_W;
2471 if (va < VM_MAXUSER_ADDRESS)
2472 newpte |= PG_U;
2473 if (pmap == kernel_pmap)
2474 newpte |= pgeflag;
2475
2476 /*
2477 * if the mapping or permission bits are different, we need
2478 * to update the pte.
2479 */
2480 if ((origpte & ~(PG_M|PG_A)) != newpte) {
2481 if (origpte & PG_V) {
2482 invlva = FALSE;
2483 origpte = pte_load_store(pte, newpte | PG_A);
2484 if (origpte & PG_A) {
2485 if (origpte & PG_MANAGED)
2486 vm_page_flag_set(om, PG_REFERENCED);
2487 if (opa != VM_PAGE_TO_PHYS(m))
2488 invlva = TRUE;
2489 #ifdef PAE
2490 if ((origpte & PG_NX) == 0 &&
2491 (newpte & PG_NX) != 0)
2492 invlva = TRUE;
2493 #endif
2494 }
2495 if (origpte & PG_M) {
2496 KASSERT((origpte & PG_RW),
2497 ("pmap_enter: modified page not writable: va: %#x, pte: %#jx",
2498 va, (uintmax_t)origpte));
2499 if ((origpte & PG_MANAGED) != 0)
2500 vm_page_dirty(om);
2501 if ((prot & VM_PROT_WRITE) == 0)
2502 invlva = TRUE;
2503 }
2504 if (invlva)
2505 pmap_invalidate_page(pmap, va);
2506 } else
2507 pte_store(pte, newpte | PG_A);
2508 }
2509 sched_unpin();
2510 vm_page_unlock_queues();
2511 PMAP_UNLOCK(pmap);
2512 }
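
/*
 * Usage sketch (hypothetical, not from the original source): a
 * typical caller, such as the page fault handler, enters a mapping
 * along the lines of
 *
 *	pmap_enter(pmap, va, m, VM_PROT_READ | VM_PROT_WRITE, FALSE);
 *
 * Because pmap_allocpte() is called with M_WAITOK for user addresses,
 * pmap_enter() may sleep and must not be used where blocking is
 * forbidden; pmap_enter_quick() below is the non-sleeping variant.
 */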
2513
2514 /*
2515 * Maps a sequence of resident pages belonging to the same object.
2516 * The sequence begins with the given page m_start. This page is
2517 * mapped at the given virtual address start. Each subsequent page is
2518 * mapped at a virtual address that is offset from start by the same
2519 * amount as the page is offset from m_start within the object. The
2520 * last page in the sequence is the page with the largest offset from
2521 * m_start that can be mapped at a virtual address less than the given
2522 * virtual address end. Not every virtual page between start and end
2523 * is mapped; only those for which a resident page exists with the
2524 * corresponding offset from m_start are mapped.
2525 */
2526 void
2527 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
2528 vm_page_t m_start, vm_prot_t prot)
2529 {
2530 vm_page_t m, mpte;
2531 vm_pindex_t diff, psize;
2532
2533 VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
2534 psize = atop(end - start);
2535 mpte = NULL;
2536 m = m_start;
2537 PMAP_LOCK(pmap);
2538 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
2539 mpte = pmap_enter_quick_locked(pmap, start + ptoa(diff), m,
2540 prot, mpte);
2541 m = TAILQ_NEXT(m, listq);
2542 }
2543 PMAP_UNLOCK(pmap);
2544 }
2545
2546 /*
2547 * This code makes some *MAJOR* assumptions:
2548 * 1. The current pmap and the given pmap exist.
2549 * 2. Not wired.
2550 * 3. Read access.
2551 * 4. No page table pages.
2552 * It is, however, *MUCH* faster than pmap_enter...
2553 */
2554
2555 void
2556 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
2557 {
2558
2559 PMAP_LOCK(pmap);
2560 (void) pmap_enter_quick_locked(pmap, va, m, prot, NULL);
2561 PMAP_UNLOCK(pmap);
2562 }
2563
2564 static vm_page_t
2565 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
2566 vm_prot_t prot, vm_page_t mpte)
2567 {
2568 pt_entry_t *pte;
2569 vm_paddr_t pa;
2570 vm_page_t free;
2571
2572 KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
2573 (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0,
2574 ("pmap_enter_quick_locked: managed mapping within the clean submap"));
2575 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
2576 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2577
2578 /*
2579 * In the case that a page table page is not
2580 * resident, we are creating it here.
2581 */
2582 if (va < VM_MAXUSER_ADDRESS) {
2583 unsigned ptepindex;
2584 pd_entry_t ptepa;
2585
2586 /*
2587 * Calculate pagetable page index
2588 */
2589 ptepindex = va >> PDRSHIFT;
2590 if (mpte && (mpte->pindex == ptepindex)) {
2591 mpte->wire_count++;
2592 } else {
2593 /*
2594 * Get the page directory entry
2595 */
2596 ptepa = pmap->pm_pdir[ptepindex];
2597
2598 /*
2599 * If the page table page is mapped, we just increment
2600 * the hold count, and activate it.
2601 */
2602 if (ptepa) {
2603 if (ptepa & PG_PS)
2604 panic("pmap_enter_quick: unexpected mapping into 4MB page");
2605 mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
2606 mpte->wire_count++;
2607 } else {
2608 mpte = _pmap_allocpte(pmap, ptepindex,
2609 M_NOWAIT);
2610 if (mpte == NULL)
2611 return (mpte);
2612 }
2613 }
2614 } else {
2615 mpte = NULL;
2616 }
2617
2618 /*
2619 * This call to vtopte makes the assumption that we are
2620 * entering the page into the current pmap. In order to support
2621 * quick entry into any pmap, one would likely use pmap_pte_quick.
2622 * But that isn't as quick as vtopte.
2623 */
2624 pte = vtopte(va);
2625 if (*pte) {
2626 if (mpte != NULL) {
2627 mpte->wire_count--;
2628 mpte = NULL;
2629 }
2630 return (mpte);
2631 }
2632
2633 /*
2634 * Enter on the PV list if part of our managed memory.
2635 */
2636 if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 &&
2637 !pmap_try_insert_pv_entry(pmap, va, m)) {
2638 if (mpte != NULL) {
2639 free = NULL;
2640 if (pmap_unwire_pte_hold(pmap, mpte, &free)) {
2641 pmap_invalidate_page(pmap, va);
2642 pmap_free_zero_pages(free);
2643 }
2644
2645 mpte = NULL;
2646 }
2647 return (mpte);
2648 }
2649
2650 /*
2651 * Increment counters
2652 */
2653 pmap->pm_stats.resident_count++;
2654
2655 pa = VM_PAGE_TO_PHYS(m);
2656 #ifdef PAE
2657 if ((prot & VM_PROT_EXECUTE) == 0)
2658 pa |= pg_nx;
2659 #endif
2660
2661 /*
2662 * Now validate mapping with RO protection
2663 */
2664 if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
2665 pte_store(pte, pa | PG_V | PG_U);
2666 else
2667 pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
2668	return (mpte);
2669 }
2670
2671 /*
2672 * Make a temporary mapping for a physical address. This is only intended
2673 * to be used for panic dumps.
2674 */
2675 void *
2676 pmap_kenter_temporary(vm_paddr_t pa, int i)
2677 {
2678 vm_offset_t va;
2679
2680 va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
2681 pmap_kenter(va, pa);
2682 invlpg(va);
2683 return ((void *)crashdumpmap);
2684 }
2685
2686 /*
2687 * This code maps large physical mmap regions into the
2688 * processor address space. Note that some shortcuts
2689 * are taken, but the code works.
2690 */
2691 void
2692 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
2693 vm_object_t object, vm_pindex_t pindex,
2694 vm_size_t size)
2695 {
2696 vm_page_t p;
2697
2698 VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
2699 KASSERT(object->type == OBJT_DEVICE,
2700 ("pmap_object_init_pt: non-device object"));
2701 if (pseflag &&
2702 ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
2703 int i;
2704 vm_page_t m[1];
2705 unsigned int ptepindex;
2706 int npdes;
2707 pd_entry_t ptepa;
2708
2709 PMAP_LOCK(pmap);
2710 if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)])
2711 goto out;
2712 PMAP_UNLOCK(pmap);
2713 retry:
2714 p = vm_page_lookup(object, pindex);
2715 if (p != NULL) {
2716 if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
2717 goto retry;
2718 } else {
2719 p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
2720 if (p == NULL)
2721 return;
2722 m[0] = p;
2723
2724 if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
2725 vm_page_lock_queues();
2726 vm_page_free(p);
2727 vm_page_unlock_queues();
2728 return;
2729 }
2730
2731 p = vm_page_lookup(object, pindex);
2732 vm_page_lock_queues();
2733 vm_page_wakeup(p);
2734 vm_page_unlock_queues();
2735 }
2736
2737 ptepa = VM_PAGE_TO_PHYS(p);
2738 if (ptepa & (NBPDR - 1))
2739 return;
2740
2741 p->valid = VM_PAGE_BITS_ALL;
2742
2743 PMAP_LOCK(pmap);
2744 pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
2745 npdes = size >> PDRSHIFT;
2746		for (i = 0; i < npdes; i++) {
2747 pde_store(&pmap->pm_pdir[ptepindex],
2748 ptepa | PG_U | PG_RW | PG_V | PG_PS);
2749 ptepa += NBPDR;
2750 ptepindex += 1;
2751 }
2752 pmap_invalidate_all(pmap);
2753 out:
2754 PMAP_UNLOCK(pmap);
2755 }
2756 }
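
/*
 * Editorial note (not part of the original source): the superpage
 * arithmetic above maps the object with 4MB page-directory entries.
 * A 16MB device region, for example, yields
 *
 *	npdes = size >> PDRSHIFT;	(0x1000000 >> 22 == 4)
 *
 * and each pde_store() covers NBPDR (4MB) of physical address space,
 * which is why both addr and size must be NBPDR-aligned for this
 * fast path to be taken at all.
 */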
2757
2758 /*
2759 * Routine: pmap_change_wiring
2760 * Function: Change the wiring attribute for a map/virtual-address
2761 * pair.
2762 * In/out conditions:
2763 * The mapping must already exist in the pmap.
2764 */
2765 void
2766 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
2767 {
2768 pt_entry_t *pte;
2769
2770 PMAP_LOCK(pmap);
2771 pte = pmap_pte(pmap, va);
2772
2773 if (wired && !pmap_pte_w(pte))
2774 pmap->pm_stats.wired_count++;
2775 else if (!wired && pmap_pte_w(pte))
2776 pmap->pm_stats.wired_count--;
2777
2778 /*
2779 * Wiring is not a hardware characteristic so there is no need to
2780 * invalidate TLB.
2781 */
2782 pmap_pte_set_w(pte, wired);
2783 pmap_pte_release(pte);
2784 PMAP_UNLOCK(pmap);
2785 }
2786
2787
2788
2789 /*
2790 * Copy the range specified by src_addr/len
2791 * from the source map to the range dst_addr/len
2792 * in the destination map.
2793 *
2794 * This routine is only advisory and need not do anything.
2795 */
2796
2797 void
2798 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
2799 vm_offset_t src_addr)
2800 {
2801 vm_page_t free;
2802 vm_offset_t addr;
2803 vm_offset_t end_addr = src_addr + len;
2804 vm_offset_t pdnxt;
2805
2806 if (dst_addr != src_addr)
2807 return;
2808
2809 if (!pmap_is_current(src_pmap))
2810 return;
2811
2812 vm_page_lock_queues();
2813 if (dst_pmap < src_pmap) {
2814 PMAP_LOCK(dst_pmap);
2815 PMAP_LOCK(src_pmap);
2816 } else {
2817 PMAP_LOCK(src_pmap);
2818 PMAP_LOCK(dst_pmap);
2819 }
2820 sched_pin();
2821 for (addr = src_addr; addr < end_addr; addr = pdnxt) {
2822 pt_entry_t *src_pte, *dst_pte;
2823 vm_page_t dstmpte, srcmpte;
2824 pd_entry_t srcptepaddr;
2825 unsigned ptepindex;
2826
2827 if (addr >= UPT_MIN_ADDRESS)
2828 panic("pmap_copy: invalid to pmap_copy page tables");
2829
2830 pdnxt = (addr + NBPDR) & ~PDRMASK;
2831 ptepindex = addr >> PDRSHIFT;
2832
2833 srcptepaddr = src_pmap->pm_pdir[ptepindex];
2834 if (srcptepaddr == 0)
2835 continue;
2836
2837 if (srcptepaddr & PG_PS) {
2838 if (dst_pmap->pm_pdir[ptepindex] == 0) {
2839 dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
2840 ~PG_W;
2841 dst_pmap->pm_stats.resident_count +=
2842 NBPDR / PAGE_SIZE;
2843 }
2844 continue;
2845 }
2846
2847 srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
2848 if (srcmpte->wire_count == 0)
2849 panic("pmap_copy: source page table page is unused");
2850
2851 if (pdnxt > end_addr)
2852 pdnxt = end_addr;
2853
2854 src_pte = vtopte(addr);
2855 while (addr < pdnxt) {
2856 pt_entry_t ptetemp;
2857 ptetemp = *src_pte;
2858 /*
2859 * we only virtual copy managed pages
2860 */
2861 if ((ptetemp & PG_MANAGED) != 0) {
2862 dstmpte = pmap_allocpte(dst_pmap, addr,
2863 M_NOWAIT);
2864 if (dstmpte == NULL)
2865 break;
2866 dst_pte = pmap_pte_quick(dst_pmap, addr);
2867 if (*dst_pte == 0 &&
2868 pmap_try_insert_pv_entry(dst_pmap, addr,
2869 PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
2870 /*
2871 * Clear the wired, modified, and
2872 * accessed (referenced) bits
2873 * during the copy.
2874 */
2875 *dst_pte = ptetemp & ~(PG_W | PG_M |
2876 PG_A);
2877 dst_pmap->pm_stats.resident_count++;
2878 } else {
2879 free = NULL;
2880				if (pmap_unwire_pte_hold(dst_pmap,
2881 dstmpte, &free)) {
2882 pmap_invalidate_page(dst_pmap,
2883 addr);
2884 pmap_free_zero_pages(free);
2885 }
2886 }
2887 if (dstmpte->wire_count >= srcmpte->wire_count)
2888 break;
2889 }
2890 addr += PAGE_SIZE;
2891 src_pte++;
2892 }
2893 }
2894 sched_unpin();
2895 vm_page_unlock_queues();
2896 PMAP_UNLOCK(src_pmap);
2897 PMAP_UNLOCK(dst_pmap);
2898 }
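
/*
 * Editorial note (not part of the original source): the two pmap
 * locks above are always acquired in ascending pointer order,
 *
 *	if (dst_pmap < src_pmap) {
 *		PMAP_LOCK(dst_pmap);
 *		PMAP_LOCK(src_pmap);
 *	} else { ... the reverse ... }
 *
 * so any two threads copying between the same pair of pmaps take the
 * locks in one global order and cannot deadlock against each other.
 */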
2899
2900 static __inline void
2901 pagezero(void *page)
2902 {
2903 #if defined(I686_CPU)
2904 if (cpu_class == CPUCLASS_686) {
2905 #if defined(CPU_ENABLE_SSE)
2906 if (cpu_feature & CPUID_SSE2)
2907 sse2_pagezero(page);
2908 else
2909 #endif
2910 i686_pagezero(page);
2911 } else
2912 #endif
2913 bzero(page, PAGE_SIZE);
2914 }
2915
2916 /*
2917 * pmap_zero_page zeros the specified hardware page by mapping
2918 * the page into KVM and clearing its contents with pagezero().
2919 */
2920 void
2921 pmap_zero_page(vm_page_t m)
2922 {
2923 struct sysmaps *sysmaps;
2924
2925 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
2926 mtx_lock(&sysmaps->lock);
2927 if (*sysmaps->CMAP2)
2928 panic("pmap_zero_page: CMAP2 busy");
2929 sched_pin();
2930 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2931 invlcaddr(sysmaps->CADDR2);
2932 pagezero(sysmaps->CADDR2);
2933 *sysmaps->CMAP2 = 0;
2934 sched_unpin();
2935 mtx_unlock(&sysmaps->lock);
2936 }
2937
2938 /*
2939 * pmap_zero_page_area zeros the specified hardware page by mapping
2940 * the page into KVM and using bzero to clear its contents.
2941 *
2942 * off and size may not cover an area beyond a single hardware page.
2943 */
2944 void
2945 pmap_zero_page_area(vm_page_t m, int off, int size)
2946 {
2947 struct sysmaps *sysmaps;
2948
2949 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
2950 mtx_lock(&sysmaps->lock);
2951 if (*sysmaps->CMAP2)
2952		panic("pmap_zero_page_area: CMAP2 busy");
2953 sched_pin();
2954 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2955 invlcaddr(sysmaps->CADDR2);
2956 if (off == 0 && size == PAGE_SIZE)
2957 pagezero(sysmaps->CADDR2);
2958 else
2959 bzero((char *)sysmaps->CADDR2 + off, size);
2960 *sysmaps->CMAP2 = 0;
2961 sched_unpin();
2962 mtx_unlock(&sysmaps->lock);
2963 }
2964
2965 /*
2966 * pmap_zero_page_idle zeros the specified hardware page by mapping
2967 * the page into KVM and clearing its contents with pagezero().  This
2968 * is intended to be called from the vm_pagezero process only and
2969 * outside of Giant.
2970 */
2971 void
2972 pmap_zero_page_idle(vm_page_t m)
2973 {
2974
2975 if (*CMAP3)
2976		panic("pmap_zero_page_idle: CMAP3 busy");
2977 sched_pin();
2978 *CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
2979 invlcaddr(CADDR3);
2980 pagezero(CADDR3);
2981 *CMAP3 = 0;
2982 sched_unpin();
2983 }
2984
2985 /*
2986 * pmap_copy_page copies the specified (machine independent)
2987 * page by mapping the page into virtual memory and using
2988 * bcopy to copy the page, one machine dependent page at a
2989 * time.
2990 */
2991 void
2992 pmap_copy_page(vm_page_t src, vm_page_t dst)
2993 {
2994 struct sysmaps *sysmaps;
2995
2996 sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
2997 mtx_lock(&sysmaps->lock);
2998 if (*sysmaps->CMAP1)
2999 panic("pmap_copy_page: CMAP1 busy");
3000 if (*sysmaps->CMAP2)
3001 panic("pmap_copy_page: CMAP2 busy");
3002 sched_pin();
3003 invlpg((u_int)sysmaps->CADDR1);
3004 invlpg((u_int)sysmaps->CADDR2);
3005 *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A;
3006 *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M;
3007 bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
3008 *sysmaps->CMAP1 = 0;
3009 *sysmaps->CMAP2 = 0;
3010 sched_unpin();
3011 mtx_unlock(&sysmaps->lock);
3012 }
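
/*
 * Editorial note (not part of the original source): CADDR1/CADDR2
 * are per-CPU transient mapping windows, so the zero and copy
 * routines above pin the thread with sched_pin() for the duration.
 * Migrating to another CPU mid-operation would leave the thread
 * using a window, and a TLB entry, belonging to the original CPU.
 */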
3013
3014 /*
3015 * Returns true if the pmap's pv is one of the first
3016 * 16 pvs linked to from this page. This count may
3017 * be changed upwards or downwards in the future; it
3018 * is only necessary that true be returned for a small
3019 * subset of pmaps for proper page aging.
3020 */
3021 boolean_t
3022 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
3023 {
3024 pv_entry_t pv;
3025 int loops = 0;
3026
3027 if (m->flags & PG_FICTITIOUS)
3028		return (FALSE);
3029
3030 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3031 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3032 if (PV_PMAP(pv) == pmap) {
3033			return (TRUE);
3034 }
3035 loops++;
3036 if (loops >= 16)
3037 break;
3038 }
3039 return (FALSE);
3040 }
3041
3042 /*
3043 * Remove all pages from the specified address space;
3044 * this aids process exit speeds. Also, this code
3045 * is special-cased for the current process only, but
3046 * can have the more generic (and slightly slower)
3047 * mode enabled. This is much faster than pmap_remove
3048 * in the case of running down an entire address space.
3049 */
3050 void
3051 pmap_remove_pages(pmap_t pmap)
3052 {
3053 pt_entry_t *pte, tpte;
3054 vm_page_t m, free = NULL;
3055 pv_entry_t pv;
3056 struct pv_chunk *pc, *npc;
3057 int field, idx;
3058 int32_t bit;
3059 uint32_t inuse, bitmask;
3060 int allfree;
3061
3062 if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
3063 printf("warning: pmap_remove_pages called with non-current pmap\n");
3064 return;
3065 }
3066 vm_page_lock_queues();
3067 PMAP_LOCK(pmap);
3068 sched_pin();
3069 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
3070 allfree = 1;
3071 for (field = 0; field < _NPCM; field++) {
3072 inuse = (~(pc->pc_map[field])) & pc_freemask[field];
3073 while (inuse != 0) {
3074 bit = bsfl(inuse);
3075 bitmask = 1UL << bit;
3076 idx = field * 32 + bit;
3077 pv = &pc->pc_pventry[idx];
3078 inuse &= ~bitmask;
3079
3080 pte = vtopte(pv->pv_va);
3081 tpte = *pte;
3082
3083 if (tpte == 0) {
3084 printf(
3085 "TPTE at %p IS ZERO @ VA %08x\n",
3086 pte, pv->pv_va);
3087 panic("bad pte");
3088 }
3089
3090 /*
3091 * We cannot remove wired pages from a process's mapping at this time.
3092 */
3093 if (tpte & PG_W) {
3094 allfree = 0;
3095 continue;
3096 }
3097
3098 m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
3099 KASSERT(m->phys_addr == (tpte & PG_FRAME),
3100 ("vm_page_t %p phys_addr mismatch %016jx %016jx",
3101 m, (uintmax_t)m->phys_addr,
3102 (uintmax_t)tpte));
3103
3104 KASSERT(m < &vm_page_array[vm_page_array_size],
3105 ("pmap_remove_pages: bad tpte %#jx",
3106 (uintmax_t)tpte));
3107
3108 pmap->pm_stats.resident_count--;
3109
3110 pte_clear(pte);
3111
3112 /*
3113 * Update the vm_page_t clean/reference bits.
3114 */
3115 if (tpte & PG_M)
3116 vm_page_dirty(m);
3117
3118 /* Mark free */
3119 PV_STAT(pv_entry_frees++);
3120 PV_STAT(pv_entry_spare++);
3121 pv_entry_count--;
3122 pc->pc_map[field] |= bitmask;
3123 m->md.pv_list_count--;
3124 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3125 if (TAILQ_EMPTY(&m->md.pv_list))
3126 vm_page_flag_clear(m, PG_WRITEABLE);
3127
3128 pmap_unuse_pt(pmap, pv->pv_va, &free);
3129 }
3130 }
3131 if (allfree) {
3132 PV_STAT(pv_entry_spare -= _NPCPV);
3133 PV_STAT(pc_chunk_count--);
3134 PV_STAT(pc_chunk_frees++);
3135 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3136 m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
3137 pmap_qremove((vm_offset_t)pc, 1);
3138 vm_page_unwire(m, 0);
3139 vm_page_free(m);
3140 pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
3141 }
3142 }
3143 sched_unpin();
3144 pmap_invalidate_all(pmap);
3145 vm_page_unlock_queues();
3146 PMAP_UNLOCK(pmap);
3147 pmap_free_zero_pages(free);
3148 }
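
/*
 * Editorial sketch (not part of the original source): the chunk walk
 * above visits only allocated slots by inverting the free bitmap and
 * peeling off one set bit per pass:
 *
 *	inuse = ~pc->pc_map[field] & pc_freemask[field];
 *	while (inuse != 0) {
 *		bit = bsfl(inuse);
 *		inuse &= ~(1UL << bit);
 *		... process pc_pventry[field * 32 + bit] ...
 *	}
 *
 * The mask with pc_freemask[] strips the padding bits beyond _NPCPV
 * in the last word, so they are never mistaken for live entries.
 */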
3149
3150 /*
3151 * pmap_is_modified:
3152 *
3153 * Return whether or not the specified physical page was modified
3154 * in any physical maps.
3155 */
3156 boolean_t
3157 pmap_is_modified(vm_page_t m)
3158 {
3159 pv_entry_t pv;
3160 pt_entry_t *pte;
3161 pmap_t pmap;
3162 boolean_t rv;
3163
3164 rv = FALSE;
3165 if (m->flags & PG_FICTITIOUS)
3166 return (rv);
3167
3168 sched_pin();
3169 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3170 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3171 pmap = PV_PMAP(pv);
3172 PMAP_LOCK(pmap);
3173 pte = pmap_pte_quick(pmap, pv->pv_va);
3174 rv = (*pte & PG_M) != 0;
3175 PMAP_UNLOCK(pmap);
3176 if (rv)
3177 break;
3178 }
3179 sched_unpin();
3180 return (rv);
3181 }
3182
3183 /*
3184 * pmap_is_prefaultable:
3185 *
3186 *	Return whether or not the specified virtual address is eligible
3187 * for prefault.
3188 */
3189 boolean_t
3190 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
3191 {
3192 pt_entry_t *pte;
3193 boolean_t rv;
3194
3195 rv = FALSE;
3196 PMAP_LOCK(pmap);
3197 if (*pmap_pde(pmap, addr)) {
3198 pte = vtopte(addr);
3199 rv = *pte == 0;
3200 }
3201 PMAP_UNLOCK(pmap);
3202 return (rv);
3203 }
3204
3205 /*
3206 * Clear the write and modified bits in each of the given page's mappings.
3207 */
3208 void
3209 pmap_remove_write(vm_page_t m)
3210 {
3211 pv_entry_t pv;
3212 pmap_t pmap;
3213 pt_entry_t oldpte, *pte;
3214
3215 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3216 if ((m->flags & PG_FICTITIOUS) != 0 ||
3217 (m->flags & PG_WRITEABLE) == 0)
3218 return;
3219 sched_pin();
3220 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3221 pmap = PV_PMAP(pv);
3222 PMAP_LOCK(pmap);
3223 pte = pmap_pte_quick(pmap, pv->pv_va);
3224 retry:
3225 oldpte = *pte;
3226 if ((oldpte & PG_RW) != 0) {
3227 /*
3228 * Regardless of whether a pte is 32 or 64 bits
3229 * in size, PG_RW and PG_M are among the least
3230 * significant 32 bits.
3231 */
3232 if (!atomic_cmpset_int((u_int *)pte, oldpte,
3233 oldpte & ~(PG_RW | PG_M)))
3234 goto retry;
3235 if ((oldpte & PG_M) != 0)
3236 vm_page_dirty(m);
3237 pmap_invalidate_page(pmap, pv->pv_va);
3238 }
3239 PMAP_UNLOCK(pmap);
3240 }
3241 vm_page_flag_clear(m, PG_WRITEABLE);
3242 sched_unpin();
3243 }
3244
3245 /*
3246 * pmap_ts_referenced:
3247 *
3248 * Return a count of reference bits for a page, clearing those bits.
3249 * It is not necessary for every reference bit to be cleared, but it
3250 * is necessary that 0 only be returned when there are truly no
3251 * reference bits set.
3252 *
3253 * XXX: The exact number of bits to check and clear is a matter that
3254 * should be tested and standardized at some point in the future for
3255 * optimal aging of shared pages.
3256 */
3257 int
3258 pmap_ts_referenced(vm_page_t m)
3259 {
3260 pv_entry_t pv, pvf, pvn;
3261 pmap_t pmap;
3262 pt_entry_t *pte;
3263 int rtval = 0;
3264
3265 if (m->flags & PG_FICTITIOUS)
3266 return (rtval);
3267 sched_pin();
3268 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3269 if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3270 pvf = pv;
3271 do {
3272 pvn = TAILQ_NEXT(pv, pv_list);
3273 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
3274 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
3275 pmap = PV_PMAP(pv);
3276 PMAP_LOCK(pmap);
3277 pte = pmap_pte_quick(pmap, pv->pv_va);
3278 if ((*pte & PG_A) != 0) {
3279 atomic_clear_int((u_int *)pte, PG_A);
3280 pmap_invalidate_page(pmap, pv->pv_va);
3281 rtval++;
3282 if (rtval > 4)
3283 pvn = NULL;
3284 }
3285 PMAP_UNLOCK(pmap);
3286 } while ((pv = pvn) != NULL && pv != pvf);
3287 }
3288 sched_unpin();
3289 return (rtval);
3290 }
3291
3292 /*
3293 * Clear the modify bits on the specified physical page.
3294 */
3295 void
3296 pmap_clear_modify(vm_page_t m)
3297 {
3298 pv_entry_t pv;
3299 pmap_t pmap;
3300 pt_entry_t *pte;
3301
3302 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3303 if ((m->flags & PG_FICTITIOUS) != 0)
3304 return;
3305 sched_pin();
3306 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3307 pmap = PV_PMAP(pv);
3308 PMAP_LOCK(pmap);
3309 pte = pmap_pte_quick(pmap, pv->pv_va);
3310 if ((*pte & PG_M) != 0) {
3311 /*
3312 * Regardless of whether a pte is 32 or 64 bits
3313 * in size, PG_M is among the least significant
3314 * 32 bits.
3315 */
3316 atomic_clear_int((u_int *)pte, PG_M);
3317 pmap_invalidate_page(pmap, pv->pv_va);
3318 }
3319 PMAP_UNLOCK(pmap);
3320 }
3321 sched_unpin();
3322 }
3323
3324 /*
3325 * pmap_clear_reference:
3326 *
3327 * Clear the reference bit on the specified physical page.
3328 */
3329 void
3330 pmap_clear_reference(vm_page_t m)
3331 {
3332 pv_entry_t pv;
3333 pmap_t pmap;
3334 pt_entry_t *pte;
3335
3336 mtx_assert(&vm_page_queue_mtx, MA_OWNED);
3337 if ((m->flags & PG_FICTITIOUS) != 0)
3338 return;
3339 sched_pin();
3340 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3341 pmap = PV_PMAP(pv);
3342 PMAP_LOCK(pmap);
3343 pte = pmap_pte_quick(pmap, pv->pv_va);
3344 if ((*pte & PG_A) != 0) {
3345 /*
3346 * Regardless of whether a pte is 32 or 64 bits
3347 * in size, PG_A is among the least significant
3348 * 32 bits.
3349 */
3350 atomic_clear_int((u_int *)pte, PG_A);
3351 pmap_invalidate_page(pmap, pv->pv_va);
3352 }
3353 PMAP_UNLOCK(pmap);
3354 }
3355 sched_unpin();
3356 }
3357
3358 /*
3359 * Miscellaneous support routines follow
3360 */
3361
3362 /*
3363 * Map a set of physical memory pages into the kernel virtual
3364 * address space. Return a pointer to where it is mapped. This
3365 * routine is intended to be used for mapping device memory,
3366 * NOT real memory.
3367 */
3368 void *
3369 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
3370 {
3371 vm_offset_t va, tmpva, offset;
3372
3373 offset = pa & PAGE_MASK;
3374 size = roundup(offset + size, PAGE_SIZE);
3375 pa = pa & PG_FRAME;
3376
3377 if (pa < KERNLOAD && pa + size <= KERNLOAD)
3378 va = KERNBASE + pa;
3379 else
3380 va = kmem_alloc_nofault(kernel_map, size);
3381 if (!va)
3382 panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
3383
3384 for (tmpva = va; size > 0; ) {
3385 pmap_kenter_attr(tmpva, pa, mode);
3386 size -= PAGE_SIZE;
3387 tmpva += PAGE_SIZE;
3388 pa += PAGE_SIZE;
3389 }
3390 pmap_invalidate_range(kernel_pmap, va, tmpva);
3391 pmap_invalidate_cache();
3392 return ((void *)(va + offset));
3393 }
3394
3395 void *
3396 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
3397 {
3398
3399 return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
3400 }
3401
3402 void *
3403 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
3404 {
3405
3406 return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
3407 }
3408
3409 void
3410 pmap_unmapdev(vm_offset_t va, vm_size_t size)
3411 {
3412 vm_offset_t base, offset, tmpva;
3413
3414 if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
3415 return;
3416 base = trunc_page(va);
3417 offset = va & PAGE_MASK;
3418 size = roundup(offset + size, PAGE_SIZE);
3419 for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE)
3420 pmap_kremove(tmpva);
3421 pmap_invalidate_range(kernel_pmap, va, tmpva);
3422 kmem_free(kernel_map, base, size);
3423 }
3424
3425 int
3426 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
3430 {
3431 vm_offset_t base, offset, tmpva;
3432 pt_entry_t *pte;
3433 u_int opte, npte;
3434 pd_entry_t *pde;
3435
3436 base = trunc_page(va);
3437 offset = va & PAGE_MASK;
3438 size = roundup(offset + size, PAGE_SIZE);
3439
3440 /* Only supported on kernel virtual addresses. */
3441 if (base <= VM_MAXUSER_ADDRESS)
3442 return (EINVAL);
3443
3444 /* 4MB pages and pages that aren't mapped aren't supported. */
3445 for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) {
3446 pde = pmap_pde(kernel_pmap, tmpva);
3447 if (*pde & PG_PS)
3448 return (EINVAL);
3449 if (*pde == 0)
3450 return (EINVAL);
3451		pte = vtopte(tmpva);
3452 if (*pte == 0)
3453 return (EINVAL);
3454 }
3455
3456 /*
3457 * Ok, all the pages exist and are 4k, so run through them updating
3458 * their cache mode.
3459 */
3460 for (tmpva = base; size > 0; ) {
3461 pte = vtopte(tmpva);
3462
3463 /*
3464 * The cache mode bits are all in the low 32-bits of the
3465 * PTE, so we can just spin on updating the low 32-bits.
3466 */
3467 do {
3468 opte = *(u_int *)pte;
3469 npte = opte & ~(PG_PTE_PAT | PG_NC_PCD | PG_NC_PWT);
3470 npte |= pmap_cache_bits(mode, 0);
3471 } while (npte != opte &&
3472 !atomic_cmpset_int((u_int *)pte, opte, npte));
3473 tmpva += PAGE_SIZE;
3474 size -= PAGE_SIZE;
3475 }
3476
3477 /*
3478 * Flush CPU caches to make sure any data isn't cached that shouldn't
3479 * be, etc.
3480 */
3481 pmap_invalidate_range(kernel_pmap, base, tmpva);
3482 pmap_invalidate_cache();
3483 return (0);
3484 }
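
/*
 * Usage sketch (hypothetical, not from the original source): a driver
 * that has mapped a frame buffer might request write-combining on the
 * range, assuming it consists of ordinary 4K kernel mappings and that
 * a PAT_WRITE_COMBINING mode is available:
 *
 *	error = pmap_change_attr(va, size, PAT_WRITE_COMBINING);
 *	if (error != 0)
 *		return (error);
 *
 * The call fails with EINVAL for user addresses, unmapped pages, and
 * 4MB mappings, as checked above.
 */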
3485
3486 /*
3487 * perform the pmap work for mincore
3488 */
3489 int
3490 pmap_mincore(pmap_t pmap, vm_offset_t addr)
3491 {
3492 pt_entry_t *ptep, pte;
3493 vm_page_t m;
3494 int val = 0;
3495
3496 PMAP_LOCK(pmap);
3497 ptep = pmap_pte(pmap, addr);
3498 pte = (ptep != NULL) ? *ptep : 0;
3499 pmap_pte_release(ptep);
3500 PMAP_UNLOCK(pmap);
3501
3502 if (pte != 0) {
3503 vm_paddr_t pa;
3504
3505 val = MINCORE_INCORE;
3506 if ((pte & PG_MANAGED) == 0)
3507			return (val);
3508
3509 pa = pte & PG_FRAME;
3510
3511 m = PHYS_TO_VM_PAGE(pa);
3512
3513 /*
3514 * Modified by us
3515 */
3516 if (pte & PG_M)
3517 val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
3518 else {
3519 /*
3520 * Modified by someone else
3521 */
3522 vm_page_lock_queues();
3523 if (m->dirty || pmap_is_modified(m))
3524 val |= MINCORE_MODIFIED_OTHER;
3525 vm_page_unlock_queues();
3526 }
3527 /*
3528 * Referenced by us
3529 */
3530 if (pte & PG_A)
3531 val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
3532 else {
3533 /*
3534 * Referenced by someone else
3535 */
3536 vm_page_lock_queues();
3537 if ((m->flags & PG_REFERENCED) ||
3538 pmap_ts_referenced(m)) {
3539 val |= MINCORE_REFERENCED_OTHER;
3540 vm_page_flag_set(m, PG_REFERENCED);
3541 }
3542 vm_page_unlock_queues();
3543 }
3544 }
3545	return (val);
3546 }
3547
3548 void
3549 pmap_activate(struct thread *td)
3550 {
3551 pmap_t pmap, oldpmap;
3552 u_int32_t cr3;
3553
3554 critical_enter();
3555 pmap = vmspace_pmap(td->td_proc->p_vmspace);
3556 oldpmap = PCPU_GET(curpmap);
3557 #if defined(SMP)
3558 atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
3559 atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
3560 #else
3561 oldpmap->pm_active &= ~1;
3562 pmap->pm_active |= 1;
3563 #endif
3564 #ifdef PAE
3565 cr3 = vtophys(pmap->pm_pdpt);
3566 #else
3567 cr3 = vtophys(pmap->pm_pdir);
3568 #endif
3569 /*
3570 * pmap_activate is for the current thread on the current cpu
3571 */
3572 td->td_pcb->pcb_cr3 = cr3;
3573 load_cr3(cr3);
3574 PCPU_SET(curpmap, pmap);
3575 critical_exit();
3576 }
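
/*
 * Editorial note (not part of the original source): load_cr3() points
 * the MMU at the new page directory (the PDPT under PAE), implicitly
 * flushing all non-global TLB entries.  The same value is stashed in
 * td->td_pcb->pcb_cr3 so that cpu_switch() reloads it whenever this
 * thread is put back on a CPU.
 */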
3577
3578 vm_offset_t
3579 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
3580 {
3581
3582 if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) {
3583		return (addr);
3584 }
3585
3586 addr = (addr + PDRMASK) & ~PDRMASK;
3587	return (addr);
3588 }
3589
3590
3591 #if defined(PMAP_DEBUG)
3592 int pmap_pid_dump(int pid)
{
3593	pmap_t pmap;
3594 pmap_t pmap;
3595 struct proc *p;
3596 int npte = 0;
3597 int index;
3598
3599 sx_slock(&allproc_lock);
3600 FOREACH_PROC_IN_SYSTEM(p) {
3601 if (p->p_pid != pid)
3602 continue;
3603
3604 if (p->p_vmspace) {
3605			int i, j;
3606 index = 0;
3607 pmap = vmspace_pmap(p->p_vmspace);
3608 for (i = 0; i < NPDEPTD; i++) {
3609 pd_entry_t *pde;
3610 pt_entry_t *pte;
3611 vm_offset_t base = i << PDRSHIFT;
3612
3613 pde = &pmap->pm_pdir[i];
3614 if (pde && pmap_pde_v(pde)) {
3615 for (j = 0; j < NPTEPG; j++) {
3616 vm_offset_t va = base + (j << PAGE_SHIFT);
3617 if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
3618 if (index) {
3619 index = 0;
3620 printf("\n");
3621 }
3622 sx_sunlock(&allproc_lock);
3623							return (npte);
3624 }
3625 pte = pmap_pte(pmap, va);
3626 if (pte && pmap_pte_v(pte)) {
3627 pt_entry_t pa;
3628 vm_page_t m;
3629 pa = *pte;
3630 m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
3631 printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x",
3632 va, pa, m->hold_count, m->wire_count, m->flags);
3633 npte++;
3634 index++;
3635 if (index >= 2) {
3636 index = 0;
3637 printf("\n");
3638 } else {
3639 printf(" ");
3640 }
3641 }
3642 }
3643 }
3644 }
3645 }
3646 }
3647 sx_sunlock(&allproc_lock);
3648	return (npte);
3649 }
3650 #endif
3651
3652 #if defined(DEBUG)
3653
3654 static void pads(pmap_t pm);
3655 void pmap_pvdump(vm_offset_t pa);
3656
3657 /* Print the address space of the pmap. */
3658 static void
3659 pads(pmap_t pm)
3660 {
3661 int i, j;
3662	vm_offset_t va;
3663 pt_entry_t *ptep;
3664
3665 if (pm == kernel_pmap)
3666 return;
3667 for (i = 0; i < NPDEPTD; i++)
3668 if (pm->pm_pdir[i])
3669 for (j = 0; j < NPTEPG; j++) {
3670 va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
3671 if (pm == kernel_pmap && va < KERNBASE)
3672 continue;
3673 if (pm != kernel_pmap && va > UPT_MAX_ADDRESS)
3674 continue;
3675 ptep = pmap_pte(pm, va);
3676 if (pmap_pte_v(ptep))
3677 printf("%x:%x ", va, *ptep);
3678		}
3679
3680 }
3681
3682 void
3683 pmap_pvdump(vm_paddr_t pa)
3684 {
3685 pv_entry_t pv;
3686 pmap_t pmap;
3687 vm_page_t m;
3688
3689 printf("pa %x", pa);
3690 m = PHYS_TO_VM_PAGE(pa);
3691 TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
3692 pmap = PV_PMAP(pv);
3693 printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va);
3694 pads(pmap);
3695 }
3696 printf(" ");
3697 }
3698 #endif