FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_page.c
1 /*
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * The Mach Operating System project at Carnegie-Mellon University.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 * must display the following acknowledgement:
18 * This product includes software developed by the University of
19 * California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91
37 * $FreeBSD$
38 */
39
40 /*
41 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
42 * All rights reserved.
43 *
44 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
45 *
46 * Permission to use, copy, modify and distribute this software and
47 * its documentation is hereby granted, provided that both the copyright
48 * notice and this permission notice appear in all copies of the
49 * software, derivative works or modified versions, and any portions
50 * thereof, and that both notices appear in supporting documentation.
51 *
52 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
53 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
54 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
55 *
56 * Carnegie Mellon requests users of this software to return to
57 *
58 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
59 * School of Computer Science
60 * Carnegie Mellon University
61 * Pittsburgh PA 15213-3890
62 *
63 * any improvements or extensions that they make and grant Carnegie the
64 * rights to redistribute these changes.
65 */
66
67 /*
68 * Resident memory management module.
69 */
70
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/malloc.h>
74 #include <sys/proc.h>
75 #include <sys/vmmeter.h>
76 #include <sys/vnode.h>
77
78 #include <vm/vm.h>
79 #include <vm/vm_param.h>
80 #include <sys/lock.h>
81 #include <vm/vm_kern.h>
82 #include <vm/pmap.h>
83 #include <vm/vm_map.h>
84 #include <vm/vm_object.h>
85 #include <vm/vm_page.h>
86 #include <vm/vm_pageout.h>
87 #include <vm/vm_pager.h>
88 #include <vm/vm_extern.h>
89
90 static void vm_page_queue_init (void);
91 static vm_page_t vm_page_select_cache (vm_object_t, vm_pindex_t);
92
93 /*
94 * Associated with each page of user-allocatable memory is a
95 * page structure.
96 */
97
98 static struct vm_page **vm_page_buckets; /* Array of buckets */
99 static int vm_page_bucket_count; /* How big is array? */
100 static int vm_page_hash_mask; /* Mask for hash function */
101 static volatile int vm_page_bucket_generation;
102
103 struct vpgqueues vm_page_queues[PQ_COUNT];
104
105 static void
106 vm_page_queue_init(void) {
107 int i;
108
109 for (i = 0; i < PQ_L2_SIZE; i++) {
110 vm_page_queues[PQ_FREE+i].cnt = &cnt.v_free_count;
111 }
112 vm_page_queues[PQ_INACTIVE].cnt = &cnt.v_inactive_count;
113
114 vm_page_queues[PQ_ACTIVE].cnt = &cnt.v_active_count;
115 vm_page_queues[PQ_HOLD].cnt = &cnt.v_active_count;
116 for (i = 0; i < PQ_L2_SIZE; i++) {
117 vm_page_queues[PQ_CACHE+i].cnt = &cnt.v_cache_count;
118 }
119 for (i = 0; i < PQ_COUNT; i++) {
120 TAILQ_INIT(&vm_page_queues[i].pl);
121 }
122 }
123
124 vm_page_t vm_page_array = 0;
125 int vm_page_array_size = 0;
126 long first_page = 0;
127 int vm_page_zero_count = 0;
128
129 static __inline int vm_page_hash (vm_object_t object, vm_pindex_t pindex);
130 static void vm_page_free_wakeup (void);
131
132 /*
133 * vm_set_page_size:
134 *
135 * Sets the page size, perhaps based upon the memory
136 * size. Must be called before any use of page-size
137 * dependent functions.
138 */
139 void
140 vm_set_page_size(void)
141 {
142 if (cnt.v_page_size == 0)
143 cnt.v_page_size = PAGE_SIZE;
144 if (((cnt.v_page_size - 1) & cnt.v_page_size) != 0)
145 panic("vm_set_page_size: page size not a power of two");
146 }
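
/*
 * A short worked example of the power-of-two test used above, assuming a
 * hypothetical 4096-byte page: x = 0x1000, x - 1 = 0x0fff, so
 * ((x - 1) & x) == 0 and the size is accepted.  A non-power-of-two such as
 * x = 0x1800 gives ((x - 1) & x) == 0x1000, which triggers the panic.
 */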
147
148 /*
149 * vm_add_new_page:
150 *
151 * Add a new page to the freelist for use by the system.
152 * Must be called at splhigh().
153 */
154 vm_page_t
155 vm_add_new_page(vm_paddr_t pa)
156 {
157 vm_page_t m;
158
159 ++cnt.v_page_count;
160 ++cnt.v_free_count;
161 m = PHYS_TO_VM_PAGE(pa);
162 m->phys_addr = pa;
163 m->flags = 0;
164 m->pc = (pa >> PAGE_SHIFT) & PQ_L2_MASK;
165 m->queue = m->pc + PQ_FREE;
166 TAILQ_INSERT_HEAD(&vm_page_queues[m->queue].pl, m, pageq);
167 vm_page_queues[m->queue].lcnt++;
168 return (m);
169 }
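
/*
 * A sketch of the color computation above, assuming PAGE_SHIFT is 12 and
 * PQ_L2_SIZE is 64 (both are configuration-dependent): a page at physical
 * address 0x12345000 has page frame number 0x12345, so its color is
 * 0x12345 & 63 == 5 and it goes on the PQ_FREE + 5 queue.  Pages that are
 * contiguous in physical memory thus cycle through the free queues, which
 * is what the page coloring code later exploits.
 */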
170
171 /*
172 * vm_page_startup:
173 *
174 * Initializes the resident memory module.
175 *
176 * Allocates memory for the page cells, and
177 * for the object/offset-to-page hash table headers.
178 * Each page cell is initialized and placed on the free list.
179 */
180
181 vm_offset_t
182 vm_page_startup(vm_offset_t starta, vm_offset_t enda, vm_offset_t vaddr)
183 {
184 vm_offset_t mapped;
185 struct vm_page **bucket;
186 vm_size_t npages;
187 vm_paddr_t page_range;
188 vm_paddr_t new_end;
189 int i;
190 vm_paddr_t pa;
191 int nblocks;
192 vm_paddr_t last_pa;
193
194 /* the biggest memory array is the second group of pages */
195 vm_paddr_t end;
196 vm_paddr_t biggestone, biggestsize;
197
198 vm_paddr_t total;
199
200 total = 0;
201 biggestsize = 0;
202 biggestone = 0;
203 nblocks = 0;
204 vaddr = round_page(vaddr);
205
206 for (i = 0; phys_avail[i + 1]; i += 2) {
207 phys_avail[i] = round_page(phys_avail[i]);
208 phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
209 }
210
211 for (i = 0; phys_avail[i + 1]; i += 2) {
212 vm_paddr_t size = phys_avail[i + 1] - phys_avail[i];
213
214 if (size > biggestsize) {
215 biggestone = i;
216 biggestsize = size;
217 }
218 ++nblocks;
219 total += size;
220 }
221
222 end = phys_avail[biggestone+1];
223
224 /*
225 * Initialize the queue headers for the free queue, the active queue
226 * and the inactive queue.
227 */
228
229 vm_page_queue_init();
230
231 /*
232 * Allocate (and initialize) the hash table buckets.
233 *
234 * The number of buckets MUST BE a power of 2, and the actual value is
235 * the next power of 2 greater than the number of physical pages in
236 * the system.
237 *
238 * We make the hash table approximately 2x the number of pages to
239 * reduce the chain length. Using the singly-linked list, this is
240 * about the same size as the 1x hash table we were using before
241 * with TAILQ, but the chain length will be smaller.
242 *
243 * Note: This computation can be tweaked if desired.
244 */
245 vm_page_buckets = (struct vm_page **)vaddr;
246 bucket = vm_page_buckets;
247 if (vm_page_bucket_count == 0) {
248 vm_page_bucket_count = 1;
249 while (vm_page_bucket_count < atop(total))
250 vm_page_bucket_count <<= 1;
251 }
252 vm_page_bucket_count <<= 1;
253 vm_page_hash_mask = vm_page_bucket_count - 1;
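
/*
 * For example, on a hypothetical machine with 48000 physical pages the
 * loop above stops at 65536 (the first power of 2 >= atop(total)) and the
 * extra shift doubles it, giving 131072 buckets and a hash mask of 0x1ffff.
 */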
254
255 /*
256 * Validate these addresses.
257 */
258 new_end = end - vm_page_bucket_count * sizeof(struct vm_page *);
259 new_end = trunc_page(new_end);
260 mapped = round_page(vaddr);
261 vaddr = pmap_map(mapped, new_end, end,
262 VM_PROT_READ | VM_PROT_WRITE);
263 vaddr = round_page(vaddr);
264 bzero((caddr_t) mapped, vaddr - mapped);
265
266 for (i = 0; i < vm_page_bucket_count; i++) {
267 *bucket = NULL;
268 bucket++;
269 }
270
271 /*
272 * Compute the number of pages of memory that will be available for
273 * use (taking into account the overhead of a page structure per
274 * page).
275 */
276
277 first_page = phys_avail[0] / PAGE_SIZE;
278
279 page_range = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE - first_page;
280 npages = (total - (page_range * sizeof(struct vm_page)) -
281 (end - new_end)) / PAGE_SIZE;
282
283 end = new_end;
284 /*
285 * Initialize the mem entry structures now, and put them in the free
286 * queue.
287 */
288 vm_page_array = (vm_page_t) vaddr;
289 mapped = vaddr;
290
291 /*
292 * Validate these addresses.
293 */
294
295 new_end = trunc_page(end - page_range * sizeof(struct vm_page));
296 mapped = pmap_map(mapped, new_end, end,
297 VM_PROT_READ | VM_PROT_WRITE);
298
299 /*
300 * Clear all of the page structures
301 */
302 bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
303 vm_page_array_size = page_range;
304
305 /*
306 * Construct the free queue(s) in descending order (by physical
307 * address) so that the first 16MB of physical memory is allocated
308 * last rather than first. On large-memory machines, this avoids
309 * the exhaustion of low physical memory before isa_dmainit has run.
310 */
311 cnt.v_page_count = 0;
312 cnt.v_free_count = 0;
313 for (i = 0; phys_avail[i + 1] && npages > 0; i += 2) {
314 pa = phys_avail[i];
315 if (i == biggestone)
316 last_pa = new_end;
317 else
318 last_pa = phys_avail[i + 1];
319 while (pa < last_pa && npages-- > 0) {
320 vm_add_new_page(pa);
321 pa += PAGE_SIZE;
322 }
323 }
324 return (mapped);
325 }
326
327 /*
328 * vm_page_hash:
329 *
330 * Distributes the object/offset key pair among hash buckets.
331 *
332 * NOTE: This routine depends on vm_page_bucket_count being a power of 2.
333 * This routine may not block.
334 *
335 * We try to randomize the hash based on the object to spread the pages
336 * out in the hash table without it costing us too much.
337 */
338 static __inline int
339 vm_page_hash(vm_object_t object, vm_pindex_t pindex)
340 {
341 int i = ((uintptr_t)object + pindex) ^ object->hash_rand;
342
343 return(i & vm_page_hash_mask);
344 }
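
/*
 * An illustrative evaluation with made-up values: with a hash mask of
 * 0x3ff, an object at 0xc1000000 whose hash_rand is 0x1a4, and pindex 5,
 * the sum is 0xc1000005, the XOR gives 0xc10001a1, and masking leaves
 * bucket 0x1a1.  The per-object hash_rand keeps pages of different objects
 * with identical pindexes from piling into the same buckets.
 */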
345
346 void
347 vm_page_unhold(vm_page_t mem)
348 {
349 --mem->hold_count;
350 KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!"));
351 if (mem->hold_count == 0 && mem->queue == PQ_HOLD)
352 vm_page_free_toq(mem);
353 }
354
355 /*
356 * vm_page_insert: [ internal use only ]
357 *
358 * Inserts the given mem entry into the object and object list.
359 *
360 * The pagetables are not updated but will presumably fault the page
361 * in if necessary, or if a kernel page the caller will at some point
362 * enter the page into the kernel's pmap. We are not allowed to block
363 * here so we *can't* do this anyway.
364 *
365 * The object and page must be locked, and must be splhigh.
366 * This routine may not block.
367 */
368
369 void
370 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
371 {
372 struct vm_page **bucket;
373
374 if (m->object != NULL)
375 panic("vm_page_insert: already inserted");
376
377 /*
378 * Record the object/offset pair in this page
379 */
380
381 m->object = object;
382 m->pindex = pindex;
383
384 /*
385 * Insert it into the object_object/offset hash table
386 */
387
388 bucket = &vm_page_buckets[vm_page_hash(object, pindex)];
389 m->hnext = *bucket;
390 *bucket = m;
391 vm_page_bucket_generation++;
392
393 /*
394 * Now link into the object's list of backed pages.
395 */
396
397 TAILQ_INSERT_TAIL(&object->memq, m, listq);
398 object->generation++;
399
400 /*
401 * show that the object has one more resident page.
402 */
403
404 object->resident_page_count++;
405
406 /*
407 * Since we are inserting a new and possibly dirty page,
408 * update the object's OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY flags.
409 */
410 if (m->flags & PG_WRITEABLE)
411 vm_object_set_writeable_dirty(object);
412 }
413
414 /*
415 * vm_page_remove:
416 * NOTE: used by device pager as well -wfj
417 *
418 * Removes the given mem entry from the object/offset-page
419 * table and the object page list, but does not invalidate/terminate
420 * the backing store.
421 *
422 * The object and page must be locked, and at splhigh.
423 * The underlying pmap entry (if any) is NOT removed here.
424 * This routine may not block.
425 */
426
427 void
428 vm_page_remove(vm_page_t m)
429 {
430 vm_object_t object;
431
432 if (m->object == NULL)
433 return;
434
435 if ((m->flags & PG_BUSY) == 0) {
436 panic("vm_page_remove: page not busy");
437 }
438
439 /*
440 * Basically destroy the page.
441 */
442
443 vm_page_wakeup(m);
444
445 object = m->object;
446
447 /*
448 * Remove from the object_object/offset hash table. The object
449 * must be on the hash queue; we will panic if it isn't.
450 *
451 * Note: we must NULL-out m->hnext to prevent loops in detached
452 * buffers with vm_page_lookup().
453 */
454
455 {
456 struct vm_page **bucket;
457
458 bucket = &vm_page_buckets[vm_page_hash(m->object, m->pindex)];
459 while (*bucket != m) {
460 if (*bucket == NULL)
461 panic("vm_page_remove(): page not found in hash");
462 bucket = &(*bucket)->hnext;
463 }
464 *bucket = m->hnext;
465 m->hnext = NULL;
466 vm_page_bucket_generation++;
467 }
468
469 /*
470 * Now remove from the object's list of backed pages.
471 */
472
473 TAILQ_REMOVE(&object->memq, m, listq);
474
475 /*
476 * And show that the object has one fewer resident page.
477 */
478
479 object->resident_page_count--;
480 object->generation++;
481
482 m->object = NULL;
483 }
484
485 /*
486 * vm_page_lookup:
487 *
488 * Returns the page associated with the object/offset
489 * pair specified; if none is found, NULL is returned.
490 *
491 * NOTE: the code below does not lock. It will operate properly if
492 * an interrupt makes a change, but the generation algorithm will not
493 * operate properly in an SMP environment where both CPUs are able to run
494 * kernel code simultaneously.
495 *
496 * The object must be locked. No side effects.
497 * This routine may not block.
498 * This is a critical path routine
499 */
500
501 vm_page_t
502 vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
503 {
504 vm_page_t m;
505 struct vm_page **bucket;
506 int generation;
507
508 /*
509 * Search the hash table for this object/offset pair
510 */
511
512 retry:
513 generation = vm_page_bucket_generation;
514 bucket = &vm_page_buckets[vm_page_hash(object, pindex)];
515 for (m = *bucket; m != NULL; m = m->hnext) {
516 if ((m->object == object) && (m->pindex == pindex)) {
517 if (vm_page_bucket_generation != generation)
518 goto retry;
519 return (m);
520 }
521 }
522 if (vm_page_bucket_generation != generation)
523 goto retry;
524 return (NULL);
525 }
526
527 /*
528 * vm_page_rename:
529 *
530 * Move the given memory entry from its
531 * current object to the specified target object/offset.
532 *
533 * The object must be locked.
534 * This routine may not block.
535 *
536 * Note: this routine will raise itself to splvm(), the caller need not.
537 *
538 * Note: swap associated with the page must be invalidated by the move. We
539 * have to do this for several reasons: (1) we aren't freeing the
540 * page, (2) we are dirtying the page, (3) the VM system is probably
541 * moving the page from object A to B, and will then later move
542 * the backing store from A to B and we can't have a conflict.
543 *
544 * Note: we *always* dirty the page. It is necessary both for the
545 * fact that we moved it, and because we may be invalidating
546 * swap. If the page is on the cache, we have to deactivate it
547 * or vm_page_dirty() will panic. Dirty pages are not allowed
548 * on the cache.
549 */
550
551 void
552 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
553 {
554 int s;
555
556 s = splvm();
557 vm_page_remove(m);
558 vm_page_insert(m, new_object, new_pindex);
559 if (m->queue - m->pc == PQ_CACHE)
560 vm_page_deactivate(m);
561 vm_page_dirty(m);
562 splx(s);
563 }
564
565 /*
566 * vm_page_unqueue_nowakeup:
567 *
568 * vm_page_unqueue() without any wakeup
569 *
570 * This routine must be called at splhigh().
571 * This routine may not block.
572 */
573
574 void
575 vm_page_unqueue_nowakeup(vm_page_t m)
576 {
577 int queue = m->queue;
578 struct vpgqueues *pq;
579 if (queue != PQ_NONE) {
580 pq = &vm_page_queues[queue];
581 m->queue = PQ_NONE;
582 TAILQ_REMOVE(&pq->pl, m, pageq);
583 (*pq->cnt)--;
584 pq->lcnt--;
585 }
586 }
587
588 /*
589 * vm_page_unqueue:
590 *
591 * Remove a page from its queue.
592 *
593 * This routine must be called at splhigh().
594 * This routine may not block.
595 */
596
597 void
598 vm_page_unqueue(vm_page_t m)
599 {
600 int queue = m->queue;
601 struct vpgqueues *pq;
602 if (queue != PQ_NONE) {
603 m->queue = PQ_NONE;
604 pq = &vm_page_queues[queue];
605 TAILQ_REMOVE(&pq->pl, m, pageq);
606 (*pq->cnt)--;
607 pq->lcnt--;
608 if ((queue - m->pc) == PQ_CACHE) {
609 if (vm_paging_needed())
610 pagedaemon_wakeup();
611 }
612 }
613 }
614
615 #if PQ_L2_SIZE > 1
616
617 /*
618 * vm_page_list_find:
619 *
620 * Find a page on the specified queue with color optimization.
621 *
622 * The page coloring optimization attempts to locate a page
623 * that does not overload other nearby pages in the object in
624 * the cpu's L1 or L2 caches. We need this optimization because
625 * cpu caches tend to be physical caches, while object spaces tend
626 * to be virtual.
627 *
628 * This routine must be called at splvm().
629 * This routine may not block.
630 *
631 * This routine may only be called from the vm_page_list_find() macro
632 * in vm_page.h
633 */
634 vm_page_t
635 _vm_page_list_find(int basequeue, int index)
636 {
637 int i;
638 vm_page_t m = NULL;
639 struct vpgqueues *pq;
640
641 pq = &vm_page_queues[basequeue];
642
643 /*
644 * Note that for the first loop, index+i and index-i wind up at the
645 * same place. Even though this is not totally optimal, we've already
646 * blown it by missing the cache case so we do not care.
647 */
648
649 for(i = PQ_L2_SIZE / 2; i > 0; --i) {
650 if ((m = TAILQ_FIRST(&pq[(index + i) & PQ_L2_MASK].pl)) != NULL)
651 break;
652
653 if ((m = TAILQ_FIRST(&pq[(index - i) & PQ_L2_MASK].pl)) != NULL)
654 break;
655 }
656 return(m);
657 }
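
/*
 * A concrete probe order, assuming PQ_L2_SIZE is 64: for index 10 the loop
 * above probes color 42 (where index+32 and index-32 coincide modulo 64),
 * then 41 and 43, then 40 and 44, and so on down to 11 and 9, returning
 * the first non-empty queue it finds.  The requested color itself is not
 * rechecked here because, per the note above, this is only reached from
 * the vm_page_list_find() macro in vm_page.h, which is expected to have
 * already tried that queue.
 */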
658
659 #endif
660
661 /*
662 * vm_page_select_cache:
663 *
664 * Find a page on the cache queue with color optimization. Pages that
665 * are found but are not usable (busy, held, wired, or unmanaged) are
666 * deactivated. This keeps us from using potentially busy cached pages.
667 *
668 * This routine must be called at splvm().
669 * This routine may not block.
670 */
671 vm_page_t
672 vm_page_select_cache(vm_object_t object, vm_pindex_t pindex)
673 {
674 vm_page_t m;
675
676 while (TRUE) {
677 m = vm_page_list_find(
678 PQ_CACHE,
679 (pindex + object->pg_color) & PQ_L2_MASK,
680 FALSE
681 );
682 if (m && ((m->flags & (PG_BUSY|PG_UNMANAGED)) || m->busy ||
683 m->hold_count || m->wire_count)) {
684 vm_page_deactivate(m);
685 continue;
686 }
687 return m;
688 }
689 }
690
691 /*
692 * vm_page_select_free:
693 *
694 * Find a free or zero page, with specified preference. We attempt to
695 * inline the nominal case and fall back to _vm_page_select_free()
696 * otherwise.
697 *
698 * This routine must be called at splvm().
699 * This routine may not block.
700 */
701
702 static __inline vm_page_t
703 vm_page_select_free(vm_object_t object, vm_pindex_t pindex, boolean_t prefer_zero)
704 {
705 vm_page_t m;
706
707 m = vm_page_list_find(
708 PQ_FREE,
709 (pindex + object->pg_color) & PQ_L2_MASK,
710 prefer_zero
711 );
712 return(m);
713 }
714
715 /*
716 * vm_page_alloc:
717 *
718 * Allocate and return a memory cell associated
719 * with this VM object/offset pair.
720 *
721 * page_req classes:
722 * VM_ALLOC_NORMAL normal process request
723 * VM_ALLOC_SYSTEM system *really* needs a page
724 * VM_ALLOC_INTERRUPT interrupt time request
725 * VM_ALLOC_ZERO zero page
726 *
727 * Object must be locked.
728 * This routine may not block.
729 *
730 * Additional special handling is required when called from an
731 * interrupt (VM_ALLOC_INTERRUPT). We are not allowed to mess with
732 * the page cache in this case.
733 */
734
735 vm_page_t
736 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int page_req)
737 {
738 vm_page_t m = NULL;
739 int s;
740
741 KASSERT(!vm_page_lookup(object, pindex),
742 ("vm_page_alloc: page already allocated"));
743
744 /*
745 * The pager is allowed to eat deeper into the free page list.
746 */
747
748 if ((curproc == pageproc) && (page_req != VM_ALLOC_INTERRUPT)) {
749 page_req = VM_ALLOC_SYSTEM;
750 }
751
752 s = splvm();
753
754 loop:
755 if (cnt.v_free_count > cnt.v_free_reserved) {
756 /*
757 * Allocate from the free queue if there are plenty of pages
758 * in it.
759 */
760 if (page_req == VM_ALLOC_ZERO)
761 m = vm_page_select_free(object, pindex, TRUE);
762 else
763 m = vm_page_select_free(object, pindex, FALSE);
764 } else if (
765 (page_req == VM_ALLOC_SYSTEM &&
766 cnt.v_cache_count == 0 &&
767 cnt.v_free_count > cnt.v_interrupt_free_min) ||
768 (page_req == VM_ALLOC_INTERRUPT && cnt.v_free_count > 0)
769 ) {
770 /*
771 * Interrupt or system, dig deeper into the free list.
772 */
773 m = vm_page_select_free(object, pindex, FALSE);
774 } else if (page_req != VM_ALLOC_INTERRUPT) {
775 /*
776 * Allocatable from cache (non-interrupt only). On success,
777 * we must free the page and try again, thus ensuring that
778 * cnt.v_*_free_min counters are replenished.
779 */
780 m = vm_page_select_cache(object, pindex);
781 if (m == NULL) {
782 splx(s);
783 #if defined(DIAGNOSTIC)
784 if (cnt.v_cache_count > 0)
785 printf("vm_page_alloc(NORMAL): missing pages on cache queue: %d\n", cnt.v_cache_count);
786 #endif
787 vm_pageout_deficit++;
788 pagedaemon_wakeup();
789 return (NULL);
790 }
791 KASSERT(m->dirty == 0, ("Found dirty cache page %p", m));
792 vm_page_busy(m);
793 vm_page_protect(m, VM_PROT_NONE);
794 vm_page_free(m);
795 goto loop;
796 } else {
797 /*
798 * Not allocatable from cache from interrupt, give up.
799 */
800 splx(s);
801 vm_pageout_deficit++;
802 pagedaemon_wakeup();
803 return (NULL);
804 }
805
806 /*
807 * At this point we had better have found a good page.
808 */
809
810 KASSERT(
811 m != NULL,
812 ("vm_page_alloc(): missing page on free queue\n")
813 );
814
815 /*
816 * Remove from free queue
817 */
818
819 vm_page_unqueue_nowakeup(m);
820
821 /*
822 * Initialize structure. Only the PG_ZERO flag is inherited.
823 */
824
825 if (m->flags & PG_ZERO) {
826 vm_page_zero_count--;
827 m->flags = PG_ZERO | PG_BUSY;
828 } else {
829 m->flags = PG_BUSY;
830 }
831 m->wire_count = 0;
832 m->hold_count = 0;
833 m->act_count = 0;
834 m->busy = 0;
835 m->valid = 0;
836 KASSERT(m->dirty == 0, ("vm_page_alloc: free/cache page %p was dirty", m));
837
838 /*
839 * vm_page_insert() is safe prior to the splx(). Note also that
840 * inserting a page here does not insert it into the pmap (which
841 * could cause us to block allocating memory). We cannot block
842 * anywhere.
843 */
844
845 vm_page_insert(m, object, pindex);
846
847 /*
848 * Don't wakeup too often - wakeup the pageout daemon when
849 * we would be nearly out of memory.
850 */
851 if (vm_paging_needed())
852 pagedaemon_wakeup();
853
854 splx(s);
855
856 return (m);
857 }
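
/*
 * A typical caller pattern (essentially what vm_page_grab() below does):
 *
 *	m = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
 *	if (m == NULL) {
 *		VM_WAIT;
 *		goto retry;
 *	}
 *
 * A NULL return is not fatal for normal requests; the caller blocks in
 * VM_WAIT until the pageout daemon has freed memory and then retries the
 * lookup/allocation.
 */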
858
859 /*
860 * vm_wait: (also see VM_WAIT macro)
861 *
862 * Block until free pages are available for allocation
863 * - Called in various places before memory allocations.
864 */
865
866 void
867 vm_wait(void)
868 {
869 int s;
870
871 s = splvm();
872 if (curproc == pageproc) {
873 vm_pageout_pages_needed = 1;
874 tsleep(&vm_pageout_pages_needed, PSWP, "VMWait", 0);
875 } else {
876 if (!vm_pages_needed) {
877 vm_pages_needed = 1;
878 wakeup(&vm_pages_needed);
879 }
880 tsleep(&cnt.v_free_count, PVM, "vmwait", 0);
881 }
882 splx(s);
883 }
884
885 /*
886 * vm_waitpfault: (also see VM_WAITPFAULT macro)
887 *
888 * Block until free pages are available for allocation
889 * - Called only in vm_fault so that processes page faulting
890 * can be easily tracked.
891 * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing
892 * processes will be able to grab memory first. Do not change
893 * this balance without careful testing first.
894 */
895
896 void
897 vm_waitpfault(void)
898 {
899 int s;
900
901 s = splvm();
902 if (!vm_pages_needed) {
903 vm_pages_needed = 1;
904 wakeup(&vm_pages_needed);
905 }
906 tsleep(&cnt.v_free_count, PUSER, "pfault", 0);
907 splx(s);
908 }
909
910 /*
911 * vm_await: (also see VM_AWAIT macro)
912 *
913 * asleep on an event that will signal when free pages are available
914 * for allocation.
915 */
916
917 void
918 vm_await(void)
919 {
920 int s;
921
922 s = splvm();
923 if (curproc == pageproc) {
924 vm_pageout_pages_needed = 1;
925 asleep(&vm_pageout_pages_needed, PSWP, "vmwait", 0);
926 } else {
927 if (!vm_pages_needed) {
928 vm_pages_needed++;
929 wakeup(&vm_pages_needed);
930 }
931 asleep(&cnt.v_free_count, PVM, "vmwait", 0);
932 }
933 splx(s);
934 }
935
936 #if 0
937 /*
938 * vm_page_sleep:
939 *
940 * Block until page is no longer busy.
941 */
942
943 int
944 vm_page_sleep(vm_page_t m, char *msg, char *busy)
945 {
946 int slept = 0;
947 if ((busy && *busy) || (m->flags & PG_BUSY)) {
948 int s;
949 s = splvm();
950 if ((busy && *busy) || (m->flags & PG_BUSY)) {
951 vm_page_flag_set(m, PG_WANTED);
952 tsleep(m, PVM, msg, 0);
953 slept = 1;
954 }
955 splx(s);
956 }
957 return slept;
958 }
959
960 #endif
961
962 #if 0
963
964 /*
965 * vm_page_asleep:
966 *
967 * Similar to vm_page_sleep(), but does not block. Returns 0 if
968 * the page is not busy, or 1 if the page is busy.
969 *
970 * This routine has the side effect of calling asleep() if the page
971 * was busy (1 returned).
972 */
973
974 int
975 vm_page_asleep(vm_page_t m, char *msg, char *busy)
976 {
977 int slept = 0;
978 if ((busy && *busy) || (m->flags & PG_BUSY)) {
979 int s;
980 s = splvm();
981 if ((busy && *busy) || (m->flags & PG_BUSY)) {
982 vm_page_flag_set(m, PG_WANTED);
983 asleep(m, PVM, msg, 0);
984 slept = 1;
985 }
986 splx(s);
987 }
988 return slept;
989 }
990
991 #endif
992
993 /*
994 * vm_page_activate:
995 *
996 * Put the specified page on the active list (if appropriate).
997 * Ensure that act_count is at least ACT_INIT but do not otherwise
998 * mess with it.
999 *
1000 * The page queues must be locked.
1001 * This routine may not block.
1002 */
1003 void
1004 vm_page_activate(vm_page_t m)
1005 {
1006 int s;
1007
1008 s = splvm();
1009 if (m->queue != PQ_ACTIVE) {
1010 if ((m->queue - m->pc) == PQ_CACHE)
1011 cnt.v_reactivated++;
1012
1013 vm_page_unqueue(m);
1014
1015 if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
1016 m->queue = PQ_ACTIVE;
1017 vm_page_queues[PQ_ACTIVE].lcnt++;
1018 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1019 if (m->act_count < ACT_INIT)
1020 m->act_count = ACT_INIT;
1021 cnt.v_active_count++;
1022 }
1023 } else {
1024 if (m->act_count < ACT_INIT)
1025 m->act_count = ACT_INIT;
1026 }
1027
1028 splx(s);
1029 }
1030
1031 /*
1032 * vm_page_free_wakeup:
1033 *
1034 * Helper routine for vm_page_free_toq() and vm_page_cache(). This
1035 * routine is called when a page has been added to the cache or free
1036 * queues.
1037 *
1038 * This routine may not block.
1039 * This routine must be called at splvm()
1040 */
1041 static __inline void
1042 vm_page_free_wakeup(void)
1043 {
1044 /*
1045 * if pageout daemon needs pages, then tell it that there are
1046 * some free.
1047 */
1048 if (vm_pageout_pages_needed &&
1049 cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) {
1050 wakeup(&vm_pageout_pages_needed);
1051 vm_pageout_pages_needed = 0;
1052 }
1053 /*
1054 * Wake up processes that are waiting on memory if we hit a
1055 * high water mark, and wake up the scheduler process if we have
1056 * lots of memory; this process will swap in processes.
1057 */
1058 if (vm_pages_needed && !vm_page_count_min()) {
1059 vm_pages_needed = 0;
1060 wakeup(&cnt.v_free_count);
1061 }
1062 }
1063
1064 /*
1065 * vm_page_free_toq:
1066 *
1067 * Returns the given page to the PQ_FREE list,
1068 * disassociating it with any VM object.
1069 *
1070 * Object and page must be locked prior to entry.
1071 * This routine may not block.
1072 */
1073
1074 void
1075 vm_page_free_toq(vm_page_t m)
1076 {
1077 int s;
1078 struct vpgqueues *pq;
1079 vm_object_t object = m->object;
1080
1081 s = splvm();
1082
1083 cnt.v_tfree++;
1084
1085 if (m->busy || ((m->queue - m->pc) == PQ_FREE)) {
1086 printf(
1087 "vm_page_free: pindex(%lu), busy(%d), PG_BUSY(%d), hold(%d)\n",
1088 (u_long)m->pindex, m->busy, (m->flags & PG_BUSY) ? 1 : 0,
1089 m->hold_count);
1090 if ((m->queue - m->pc) == PQ_FREE)
1091 panic("vm_page_free: freeing free page");
1092 else
1093 panic("vm_page_free: freeing busy page");
1094 }
1095
1096 /*
1097 * unqueue, then remove page. Note that we cannot destroy
1098 * the page here because we do not want to call the pager's
1099 * callback routine until after we've put the page on the
1100 * appropriate free queue.
1101 */
1102
1103 vm_page_unqueue_nowakeup(m);
1104 vm_page_remove(m);
1105
1106 /*
1107 * If fictitious, remove object association and
1108 * return; otherwise delay object association removal.
1109 */
1110
1111 if ((m->flags & PG_FICTITIOUS) != 0) {
1112 splx(s);
1113 return;
1114 }
1115
1116 m->valid = 0;
1117 vm_page_undirty(m);
1118
1119 if (m->wire_count != 0) {
1120 if (m->wire_count > 1) {
1121 panic("vm_page_free: invalid wire count (%d), pindex: 0x%lx",
1122 m->wire_count, (long)m->pindex);
1123 }
1124 panic("vm_page_free: freeing wired page\n");
1125 }
1126
1127 /*
1128 * If we've exhausted the object's resident pages we want to free
1129 * it up.
1130 */
1131
1132 if (object &&
1133 (object->type == OBJT_VNODE) &&
1134 ((object->flags & OBJ_DEAD) == 0)
1135 ) {
1136 struct vnode *vp = (struct vnode *)object->handle;
1137
1138 if (vp && VSHOULDFREE(vp))
1139 vfree(vp);
1140 }
1141
1142 /*
1143 * Clear the UNMANAGED flag when freeing an unmanaged page.
1144 */
1145
1146 if (m->flags & PG_UNMANAGED) {
1147 m->flags &= ~PG_UNMANAGED;
1148 } else {
1149 #ifdef __alpha__
1150 pmap_page_is_free(m);
1151 #endif
1152 }
1153
1154 if (m->hold_count != 0) {
1155 m->flags &= ~PG_ZERO;
1156 m->queue = PQ_HOLD;
1157 } else
1158 m->queue = PQ_FREE + m->pc;
1159 pq = &vm_page_queues[m->queue];
1160 pq->lcnt++;
1161 ++(*pq->cnt);
1162
1163 /*
1164 * Put zero'd pages at the end (where we look for zero'd pages
1165 * first) and non-zero'd pages at the head.
1166 */
1167
1168 if (m->flags & PG_ZERO) {
1169 TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
1170 ++vm_page_zero_count;
1171 } else {
1172 TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
1173 }
1174
1175 vm_page_free_wakeup();
1176
1177 splx(s);
1178 }
1179
1180 /*
1181 * vm_page_unmanage:
1182 *
1183 * Prevent PV management from being done on the page. The page is
1184 * removed from the paging queues as if it were wired, and as a
1185 * consequence of no longer being managed the pageout daemon will not
1186 * touch it (since there is no way to locate the pte mappings for the
1187 * page). madvise() calls that mess with the pmap will also no longer
1188 * operate on the page.
1189 *
1190 * Beyond that the page is still reasonably 'normal'. Freeing the page
1191 * will clear the flag.
1192 *
1193 * This routine is used by OBJT_PHYS objects - objects using unswappable
1194 * physical memory as backing store rather than swap-backed memory and
1195 * will eventually be extended to support 4MB unmanaged physical
1196 * mappings.
1197 */
1198
1199 void
1200 vm_page_unmanage(vm_page_t m)
1201 {
1202 int s;
1203
1204 s = splvm();
1205 if ((m->flags & PG_UNMANAGED) == 0) {
1206 if (m->wire_count == 0)
1207 vm_page_unqueue(m);
1208 }
1209 vm_page_flag_set(m, PG_UNMANAGED);
1210 splx(s);
1211 }
1212
1213 /*
1214 * vm_page_wire:
1215 *
1216 * Mark this page as wired down by yet
1217 * another map, removing it from paging queues
1218 * as necessary.
1219 *
1220 * The page queues must be locked.
1221 * This routine may not block.
1222 */
1223 void
1224 vm_page_wire(vm_page_t m)
1225 {
1226 int s;
1227
1228 if (m->flags & PG_FICTITIOUS)
1229 return;
1230 /*
1231 * Only bump the wire statistics if the page is not already wired,
1232 * and only unqueue the page if it is on some queue (if it is unmanaged
1233 * it is already off the queues).
1234 */
1235 s = splvm();
1236 if (m->wire_count == 0) {
1237 if ((m->flags & PG_UNMANAGED) == 0)
1238 vm_page_unqueue(m);
1239 cnt.v_wire_count++;
1240 }
1241 m->wire_count++;
1242 KASSERT(m->wire_count != 0,
1243 ("vm_page_wire: wire_count overflow m=%p", m));
1244
1245 splx(s);
1246 vm_page_flag_set(m, PG_MAPPED);
1247 }
1248
1249 /*
1250 * vm_page_unwire:
1251 *
1252 * Release one wiring of this page, potentially
1253 * enabling it to be paged again.
1254 *
1255 * Many pages placed on the inactive queue should actually go
1256 * into the cache, but it is difficult to figure out which. What
1257 * we do instead, if the inactive target is well met, is to put
1258 * clean pages at the head of the inactive queue instead of the tail.
1259 * This will cause them to be moved to the cache more quickly and
1260 * if not actively re-referenced, freed more quickly. If we just
1261 * stick these pages at the end of the inactive queue, heavy filesystem
1262 * meta-data accesses can cause an unnecessary paging load on memory bound
1263 * processes. This optimization causes one-time-use metadata to be
1264 * reused more quickly.
1265 *
1266 * BUT, if we are in a low-memory situation we have no choice but to
1267 * put clean pages on the cache queue.
1268 *
1269 * A number of routines use vm_page_unwire() to guarantee that the page
1270 * will go into either the inactive or active queues, and will NEVER
1271 * be placed in the cache - for example, just after dirtying a page.
1272 * dirty pages in the cache are not allowed.
1273 *
1274 * The page queues must be locked.
1275 * This routine may not block.
1276 */
1277 void
1278 vm_page_unwire(vm_page_t m, int activate)
1279 {
1280 int s;
1281
1282 if (m->flags & PG_FICTITIOUS)
1283 return;
1284 s = splvm();
1285
1286 if (m->wire_count > 0) {
1287 m->wire_count--;
1288 if (m->wire_count == 0) {
1289 cnt.v_wire_count--;
1290 if (m->flags & PG_UNMANAGED) {
1291 ;
1292 } else if (activate) {
1293 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE].pl, m, pageq);
1294 m->queue = PQ_ACTIVE;
1295 vm_page_queues[PQ_ACTIVE].lcnt++;
1296 cnt.v_active_count++;
1297 } else {
1298 vm_page_flag_clear(m, PG_WINATCFLS);
1299 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
1300 m->queue = PQ_INACTIVE;
1301 vm_page_queues[PQ_INACTIVE].lcnt++;
1302 cnt.v_inactive_count++;
1303 }
1304 }
1305 } else {
1306 panic("vm_page_unwire: invalid wire count: %d\n", m->wire_count);
1307 }
1308 splx(s);
1309 }
1310
1311
1312 /*
1313 * Move the specified page to the inactive queue. If the page has
1314 * any associated swap, the swap is deallocated.
1315 *
1316 * Normally athead is 0 resulting in LRU operation. athead is set
1317 * to 1 if we want this page to be 'as if it were placed in the cache',
1318 * except without unmapping it from the process address space.
1319 *
1320 * This routine may not block.
1321 */
1322 static __inline void
1323 _vm_page_deactivate(vm_page_t m, int athead)
1324 {
1325 int s;
1326
1327 /*
1328 * Ignore if already inactive.
1329 */
1330 if (m->queue == PQ_INACTIVE)
1331 return;
1332
1333 s = splvm();
1334 if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
1335 if ((m->queue - m->pc) == PQ_CACHE)
1336 cnt.v_reactivated++;
1337 vm_page_flag_clear(m, PG_WINATCFLS);
1338 vm_page_unqueue(m);
1339 if (athead)
1340 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
1341 else
1342 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
1343 m->queue = PQ_INACTIVE;
1344 vm_page_queues[PQ_INACTIVE].lcnt++;
1345 cnt.v_inactive_count++;
1346 }
1347 splx(s);
1348 }
1349
1350 void
1351 vm_page_deactivate(vm_page_t m)
1352 {
1353 _vm_page_deactivate(m, 0);
1354 }
1355
1356 /*
1357 * vm_page_try_to_cache:
1358 *
1359 * Returns 0 on failure, 1 on success
1360 */
1361 int
1362 vm_page_try_to_cache(vm_page_t m)
1363 {
1364 if (m->dirty || m->hold_count || m->busy || m->wire_count ||
1365 (m->flags & (PG_BUSY|PG_UNMANAGED))) {
1366 return(0);
1367 }
1368 vm_page_test_dirty(m);
1369 if (m->dirty)
1370 return(0);
1371 vm_page_cache(m);
1372 return(1);
1373 }
1374
1375 /*
1376 * vm_page_try_to_free()
1377 *
1378 * Attempt to free the page. If we cannot free it, we do nothing.
1379 * 1 is returned on success, 0 on failure.
1380 */
1381
1382 int
1383 vm_page_try_to_free(vm_page_t m)
1384 {
1385 if (m->dirty || m->hold_count || m->busy || m->wire_count ||
1386 (m->flags & (PG_BUSY|PG_UNMANAGED))) {
1387 return(0);
1388 }
1389 vm_page_test_dirty(m);
1390 if (m->dirty)
1391 return(0);
1392 vm_page_busy(m);
1393 vm_page_protect(m, VM_PROT_NONE);
1394 vm_page_free(m);
1395 return(1);
1396 }
1397
1398
1399 /*
1400 * vm_page_cache
1401 *
1402 * Put the specified page onto the page cache queue (if appropriate).
1403 *
1404 * This routine may not block.
1405 */
1406 void
1407 vm_page_cache(vm_page_t m)
1408 {
1409 int s;
1410
1411 if ((m->flags & (PG_BUSY|PG_UNMANAGED)) || m->busy ||
1412 m->hold_count || m->wire_count) {
1413 printf("vm_page_cache: attempting to cache busy page\n");
1414 return;
1415 }
1416 if ((m->queue - m->pc) == PQ_CACHE)
1417 return;
1418
1419 /*
1420 * Remove all pmaps and indicate that the page is not
1421 * writeable or mapped.
1422 */
1423
1424 vm_page_protect(m, VM_PROT_NONE);
1425 if (m->dirty != 0) {
1426 panic("vm_page_cache: caching a dirty page, pindex: %ld",
1427 (long)m->pindex);
1428 }
1429 s = splvm();
1430 vm_page_unqueue_nowakeup(m);
1431 m->queue = PQ_CACHE + m->pc;
1432 vm_page_queues[m->queue].lcnt++;
1433 TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq);
1434 cnt.v_cache_count++;
1435 vm_page_free_wakeup();
1436 splx(s);
1437 }
1438
1439 /*
1440 * vm_page_dontneed
1441 *
1442 * Cache, deactivate, or do nothing as appropriate. This routine
1443 * is typically used by madvise() MADV_DONTNEED.
1444 *
1445 * Generally speaking we want to move the page into the cache so
1446 * it gets reused quickly. However, this can result in a silly syndrome
1447 * due to the page recycling too quickly. Small objects will not be
1448 * fully cached. On the other hand, if we move the page to the inactive
1449 * queue we wind up with a problem whereby very large objects
1450 * unnecessarily blow away our inactive and cache queues.
1451 *
1452 * The solution is to move the pages based on a fixed weighting. We
1453 * either leave them alone, deactivate them, or move them to the cache,
1454 * where moving them to the cache has the highest weighting.
1455 * By forcing some pages into other queues we eventually force the
1456 * system to balance the queues, potentially recovering other unrelated
1457 * space from active. The idea is to not force this to happen too
1458 * often.
1459 */
1460
1461 void
1462 vm_page_dontneed(vm_page_t m)
1463 {
1464 static int dnweight;
1465 int dnw;
1466 int head;
1467
1468 dnw = ++dnweight;
1469
1470 /*
1471 * occasionally leave the page alone
1472 */
1473
1474 if ((dnw & 0x01F0) == 0 ||
1475 m->queue == PQ_INACTIVE ||
1476 m->queue - m->pc == PQ_CACHE
1477 ) {
1478 if (m->act_count >= ACT_INIT)
1479 --m->act_count;
1480 return;
1481 }
1482
1483 if (m->dirty == 0)
1484 vm_page_test_dirty(m);
1485
1486 if (m->dirty || (dnw & 0x0070) == 0) {
1487 /*
1488 * Deactivate the page 3 times out of 32.
1489 */
1490 head = 0;
1491 } else {
1492 /*
1493 * Cache the page 28 times out of every 32. Note that
1494 * the page is deactivated instead of cached, but placed
1495 * at the head of the queue instead of the tail.
1496 */
1497 head = 1;
1498 }
1499 _vm_page_deactivate(m, head);
1500 }
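
/*
 * Where the ratios in the comments above come from (considering only the
 * dnw tests): dnweight advances by one per call, so over any 512
 * consecutive calls the test (dnw & 0x01F0) == 0 is true for the 16
 * values whose bits 4-8 are clear, i.e. 1 call in 32 leaves the page
 * alone.  Of the rest, (dnw & 0x0070) == 0 holds for another 64 - 16 = 48
 * values, i.e. 3 calls in 32 deactivate normally, and the remaining 448
 * (28 in 32) take the head-of-inactive-queue, cache-like path.
 */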
1501
1502 /*
1503 * Grab a page, waiting until we are woken up due to the page
1504 * changing state. We keep on waiting as long as the page remains
1505 * in the object. If the page doesn't exist, allocate it.
1506 *
1507 * This routine may block.
1508 */
1509 vm_page_t
1510 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
1511 {
1512
1513 vm_page_t m;
1514 int s, generation;
1515
1516 retrylookup:
1517 if ((m = vm_page_lookup(object, pindex)) != NULL) {
1518 if (m->busy || (m->flags & PG_BUSY)) {
1519 generation = object->generation;
1520
1521 s = splvm();
1522 while ((object->generation == generation) &&
1523 (m->busy || (m->flags & PG_BUSY))) {
1524 vm_page_flag_set(m, PG_WANTED | PG_REFERENCED);
1525 tsleep(m, PVM, "pgrbwt", 0);
1526 if ((allocflags & VM_ALLOC_RETRY) == 0) {
1527 splx(s);
1528 return NULL;
1529 }
1530 }
1531 splx(s);
1532 goto retrylookup;
1533 } else {
1534 vm_page_busy(m);
1535 return m;
1536 }
1537 }
1538
1539 m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_RETRY);
1540 if (m == NULL) {
1541 VM_WAIT;
1542 if ((allocflags & VM_ALLOC_RETRY) == 0)
1543 return NULL;
1544 goto retrylookup;
1545 }
1546
1547 return m;
1548 }
1549
1550 /*
1551 * Mapping function for valid bits or for dirty bits in
1552 * a page. May not block.
1553 *
1554 * Inputs are required to range within a page.
1555 */
1556
1557 __inline int
1558 vm_page_bits(int base, int size)
1559 {
1560 int first_bit;
1561 int last_bit;
1562
1563 KASSERT(
1564 base + size <= PAGE_SIZE,
1565 ("vm_page_bits: illegal base/size %d/%d", base, size)
1566 );
1567
1568 if (size == 0) /* handle degenerate case */
1569 return(0);
1570
1571 first_bit = base >> DEV_BSHIFT;
1572 last_bit = (base + size - 1) >> DEV_BSHIFT;
1573
1574 return ((2 << last_bit) - (1 << first_bit));
1575 }
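
/*
 * A worked example, assuming DEV_BSIZE is 512 (DEV_BSHIFT 9) and a 4096
 * byte page: vm_page_bits(512, 1024) spans the second and third 512-byte
 * blocks, so first_bit = 1, last_bit = 2, and the result is
 * (2 << 2) - (1 << 1) = 0x06, i.e. bits 1 and 2 set.  For the same
 * geometry, vm_page_bits(0, PAGE_SIZE) yields 0xff, the all-bits mask.
 */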
1576
1577 /*
1578 * vm_page_set_validclean:
1579 *
1580 * Sets portions of a page valid and clean. The arguments are expected
1581 * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
1582 * of any partial chunks touched by the range. The invalid portion of
1583 * such chunks will be zero'd.
1584 *
1585 * This routine may not block.
1586 *
1587 * (base + size) must be less than or equal to PAGE_SIZE.
1588 */
1589 void
1590 vm_page_set_validclean(vm_page_t m, int base, int size)
1591 {
1592 int pagebits;
1593 int frag;
1594 int endoff;
1595
1596 if (size == 0) /* handle degenerate case */
1597 return;
1598
1599 /*
1600 * If the base is not DEV_BSIZE aligned and the valid
1601 * bit is clear, we have to zero out a portion of the
1602 * first block.
1603 */
1604
1605 if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
1606 (m->valid & (1 << (base >> DEV_BSHIFT))) == 0
1607 ) {
1608 pmap_zero_page_area(
1609 VM_PAGE_TO_PHYS(m),
1610 frag,
1611 base - frag
1612 );
1613 }
1614
1615 /*
1616 * If the ending offset is not DEV_BSIZE aligned and the
1617 * valid bit is clear, we have to zero out a portion of
1618 * the last block.
1619 */
1620
1621 endoff = base + size;
1622
1623 if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
1624 (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0
1625 ) {
1626 pmap_zero_page_area(
1627 VM_PAGE_TO_PHYS(m),
1628 endoff,
1629 DEV_BSIZE - (endoff & (DEV_BSIZE - 1))
1630 );
1631 }
1632
1633 /*
1634 * Set valid, clear dirty bits. If validating the entire
1635 * page we can safely clear the pmap modify bit. We also
1636 * use this opportunity to clear the PG_NOSYNC flag. If a process
1637 * takes a write fault on a MAP_NOSYNC memory area the flag will
1638 * be set again.
1639 *
1640 * We set valid bits inclusive of any overlap, but we can only
1641 * clear dirty bits for DEV_BSIZE chunks that are fully within
1642 * the range.
1643 */
1644
1645 pagebits = vm_page_bits(base, size);
1646 m->valid |= pagebits;
1647 #if 0 /* NOT YET */
1648 if ((frag = base & (DEV_BSIZE - 1)) != 0) {
1649 frag = DEV_BSIZE - frag;
1650 base += frag;
1651 size -= frag;
1652 if (size < 0)
1653 size = 0;
1654 }
1655 pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
1656 #endif
1657 m->dirty &= ~pagebits;
1658 if (base == 0 && size == PAGE_SIZE) {
1659 pmap_clear_modify(m);
1660 vm_page_flag_clear(m, PG_NOSYNC);
1661 }
1662 }
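
/*
 * For instance, assuming DEV_BSIZE is 512, a call with base = 100 and
 * size = 200 first zeroes bytes [0, 100) and [300, 512) of the page if
 * the first block is not yet valid, then computes vm_page_bits(100, 200)
 * == 0x01, so only block 0 is marked valid and has its dirty bit cleared.
 * The partially covered block is thus made fully consistent before being
 * declared valid.
 */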
1663
1664 #if 0
1665
1666 void
1667 vm_page_set_dirty(vm_page_t m, int base, int size)
1668 {
1669 m->dirty |= vm_page_bits(base, size);
1670 }
1671
1672 #endif
1673
1674 void
1675 vm_page_clear_dirty(vm_page_t m, int base, int size)
1676 {
1677 m->dirty &= ~vm_page_bits(base, size);
1678 }
1679
1680 /*
1681 * vm_page_set_invalid:
1682 *
1683 * Invalidates DEV_BSIZE'd chunks within a page. Both the
1684 * valid and dirty bits for the affected areas are cleared.
1685 *
1686 * May not block.
1687 */
1688 void
1689 vm_page_set_invalid(vm_page_t m, int base, int size)
1690 {
1691 int bits;
1692
1693 bits = vm_page_bits(base, size);
1694 m->valid &= ~bits;
1695 m->dirty &= ~bits;
1696 m->object->generation++;
1697 }
1698
1699 /*
1700 * vm_page_zero_invalid()
1701 *
1702 * The kernel assumes that the invalid portions of a page contain
1703 * garbage, but such pages can be mapped into memory by user code.
1704 * When this occurs, we must zero out the non-valid portions of the
1705 * page so user code sees what it expects.
1706 *
1707 * Pages are most often semi-valid when the end of a file is mapped
1708 * into memory and the file's size is not page aligned.
1709 */
1710
1711 void
1712 vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
1713 {
1714 int b;
1715 int i;
1716
1717 /*
1718 * Scan the valid bits looking for invalid sections that
1719 * must be zeroed. Invalid sub-DEV_BSIZE'd areas (where the
1720 * valid bit may be set) have already been zeroed by
1721 * vm_page_set_validclean().
1722 */
1723
1724 for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
1725 if (i == (PAGE_SIZE / DEV_BSIZE) ||
1726 (m->valid & (1 << i))
1727 ) {
1728 if (i > b) {
1729 pmap_zero_page_area(
1730 VM_PAGE_TO_PHYS(m),
1731 b << DEV_BSHIFT,
1732 (i - b) << DEV_BSHIFT
1733 );
1734 }
1735 b = i + 1;
1736 }
1737 }
1738
1739 /*
1740 * setvalid is TRUE when we can safely set the zero'd areas
1741 * as being valid. We can do this if there are no cache consistency
1742 * issues, e.g. it is OK to do with UFS, but not OK to do with NFS.
1743 */
1744
1745 if (setvalid)
1746 m->valid = VM_PAGE_BITS_ALL;
1747 }
1748
1749 /*
1750 * vm_page_is_valid:
1751 *
1752 * Is (partial) page valid? Note that in the degenerate case where
1753 * size == 0, this returns FALSE if the page is entirely invalid,
1754 * and TRUE otherwise.
1755 *
1756 * May not block.
1757 */
1758
1759 int
1760 vm_page_is_valid(vm_page_t m, int base, int size)
1761 {
1762 int bits = vm_page_bits(base, size);
1763
1764 if (m->valid && ((m->valid & bits) == bits))
1765 return 1;
1766 else
1767 return 0;
1768 }
1769
1770 /*
1771 * update dirty bits from pmap/mmu. May not block.
1772 */
1773
1774 void
1775 vm_page_test_dirty(vm_page_t m)
1776 {
1777 if ((m->dirty != VM_PAGE_BITS_ALL) && pmap_is_modified(m)) {
1778 vm_page_dirty(m);
1779 }
1780 }
1781
1782 /*
1783 * This interface is for merging with malloc() someday.
1784 * Even if we never implement compaction so that contiguous allocation
1785 * works after initialization time, malloc()'s data structures are good
1786 * for statistics and for allocations of less than a page.
1787 */
1788 void *
1789 contigmalloc1(
1790 unsigned long size, /* should be size_t here and for malloc() */
1791 struct malloc_type *type,
1792 int flags,
1793 vm_paddr_t low,
1794 vm_paddr_t high,
1795 unsigned long alignment,
1796 unsigned long boundary,
1797 vm_map_t map)
1798 {
1799 int i, s, start;
1800 vm_offset_t addr, tmp_addr;
1801 vm_paddr_t phys;
1802 int pass;
1803 vm_page_t pga = vm_page_array;
1804 vm_page_t m;
1805 int pqtype;
1806
1807 size = round_page(size);
1808 if (size == 0)
1809 panic("contigmalloc1: size must not be 0");
1810 if ((alignment & (alignment - 1)) != 0)
1811 panic("contigmalloc1: alignment must be a power of 2");
1812 if ((boundary & (boundary - 1)) != 0)
1813 panic("contigmalloc1: boundary must be a power of 2");
1814
1815 start = 0;
1816 for (pass = 0; pass <= 1; pass++) {
1817 s = splvm();
1818 again:
1819 /*
1820 * Find first page in array that is free, within range, aligned, and
1821 * such that the boundary won't be crossed.
1822 */
1823 for (i = start; i < cnt.v_page_count; i++) {
1824 m = &pga[i];
1825 phys = VM_PAGE_TO_PHYS(m);
1826 pqtype = m->queue - m->pc;
1827 if (((pqtype == PQ_FREE) || (pqtype == PQ_CACHE)) &&
1828 (phys >= low) && (phys < high) &&
1829 ((phys & (alignment - 1)) == 0) &&
1830 (((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0) &&
1831 m->busy == 0 && m->wire_count == 0 &&
1832 m->hold_count == 0 && (m->flags & PG_BUSY) == 0) {
1833 break;
1834 }
1835 }
1836
1837 /*
1838 * If the above failed or we will exceed the upper bound, fail.
1839 */
1840 if ((i == cnt.v_page_count) ||
1841 ((VM_PAGE_TO_PHYS(&pga[i]) + size) > high)) {
1842 vm_page_t m, next;
1843
1844 again1:
1845 for (m = TAILQ_FIRST(&vm_page_queues[PQ_INACTIVE].pl);
1846 m != NULL;
1847 m = next) {
1848
1849 next = TAILQ_NEXT(m, pageq);
1850
1851 /* Skip marker pages */
1852 if ((m->flags & PG_MARKER) != 0)
1853 continue;
1854
1855 KASSERT(m->queue == PQ_INACTIVE,
1856 ("contigmalloc1: page %p is not PQ_INACTIVE", m));
1857
1858 if (vm_page_sleep_busy(m, TRUE, "vpctw0"))
1859 goto again1;
1860 vm_page_test_dirty(m);
1861 if (m->dirty) {
1862 if (m->object->type == OBJT_VNODE) {
1863 vn_lock(m->object->handle, LK_EXCLUSIVE | LK_RETRY, curproc);
1864 vm_object_page_clean(m->object, 0, 0, OBJPC_SYNC);
1865 VOP_UNLOCK(m->object->handle, 0, curproc);
1866 goto again1;
1867 } else if (m->object->type == OBJT_SWAP ||
1868 m->object->type == OBJT_DEFAULT) {
1869 vm_pageout_flush(&m, 1, 0);
1870 goto again1;
1871 }
1872 }
1873 if ((m->dirty == 0) && (m->busy == 0) && (m->hold_count == 0))
1874 vm_page_cache(m);
1875 }
1876
1877 for (m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
1878 m != NULL;
1879 m = next) {
1880
1881 KASSERT(m->queue == PQ_ACTIVE,
1882 ("contigmalloc1: page %p is not PQ_ACTIVE", m));
1883
1884 next = TAILQ_NEXT(m, pageq);
1885 if (vm_page_sleep_busy(m, TRUE, "vpctw1"))
1886 goto again1;
1887 vm_page_test_dirty(m);
1888 if (m->dirty) {
1889 if (m->object->type == OBJT_VNODE) {
1890 vn_lock(m->object->handle, LK_EXCLUSIVE | LK_RETRY, curproc);
1891 vm_object_page_clean(m->object, 0, 0, OBJPC_SYNC);
1892 VOP_UNLOCK(m->object->handle, 0, curproc);
1893 goto again1;
1894 } else if (m->object->type == OBJT_SWAP ||
1895 m->object->type == OBJT_DEFAULT) {
1896 vm_pageout_flush(&m, 1, 0);
1897 goto again1;
1898 }
1899 }
1900 if ((m->dirty == 0) && (m->busy == 0) && (m->hold_count == 0))
1901 vm_page_cache(m);
1902 }
1903
1904 splx(s);
1905 continue;
1906 }
1907 start = i;
1908
1909 /*
1910 * Check successive pages for contiguous and free.
1911 */
1912 for (i = start + 1; i < (start + size / PAGE_SIZE); i++) {
1913 m = &pga[i];
1914 pqtype = m->queue - m->pc;
1915 if ((VM_PAGE_TO_PHYS(&m[0]) !=
1916 (VM_PAGE_TO_PHYS(&m[-1]) + PAGE_SIZE)) ||
1917 ((pqtype != PQ_FREE) && (pqtype != PQ_CACHE)) ||
1918 m->busy || m->wire_count ||
1919 m->hold_count || (m->flags & PG_BUSY) ) {
1920 start++;
1921 goto again;
1922 }
1923 }
1924
1925 for (i = start; i < (start + size / PAGE_SIZE); i++) {
1926 m = &pga[i];
1927
1928 pqtype = m->queue - m->pc;
1929 if (pqtype == PQ_CACHE) {
1930 vm_page_busy(m);
1931 vm_page_free(m);
1932 }
1933 KASSERT((m->object == NULL), ("contigmalloc: object NULL"));
1934 vm_page_unqueue_nowakeup(m);
1935 m->valid = VM_PAGE_BITS_ALL;
1936 if (m->flags & PG_ZERO)
1937 vm_page_zero_count--;
1938 /* Don't clear the PG_ZERO flag, we'll need it later. */
1939 m->flags &= PG_ZERO;
1940 KASSERT(m->dirty == 0, ("contigmalloc1: page %p was dirty", m));
1941 m->wire_count = 0;
1942 m->busy = 0;
1943 }
1944
1945 /*
1946 * We've found a contiguous chunk that meets our requirements.
1947 * Allocate kernel VM, unfree and assign the physical pages to it and
1948 * return kernel VM pointer.
1949 */
1950 vm_map_lock(map);
1951 if (vm_map_findspace(map, vm_map_min(map), size, &addr) !=
1952 KERN_SUCCESS) {
1953 /*
1954 * XXX We almost never run out of kernel virtual
1955 * space, so we don't make the allocated memory
1956 * above available.
1957 */
1958 vm_map_unlock(map);
1959 splx(s);
1960 return (NULL);
1961 }
1962 vm_object_reference(kernel_object);
1963 vm_map_insert(map, kernel_object, addr - VM_MIN_KERNEL_ADDRESS,
1964 addr, addr + size, VM_PROT_ALL, VM_PROT_ALL, 0);
1965 vm_map_unlock(map);
1966
1967 tmp_addr = addr;
1968 for (i = start; i < (start + size / PAGE_SIZE); i++) {
1969 vm_page_t m = &pga[i];
1970 vm_page_insert(m, kernel_object,
1971 OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS));
1972 if ((flags & M_ZERO) && !(m->flags & PG_ZERO))
1973 pmap_zero_page(VM_PAGE_TO_PHYS(m));
1974 m->flags = 0;
1975 tmp_addr += PAGE_SIZE;
1976 }
1977 vm_map_pageable(map, addr, addr + size, FALSE);
1978
1979 splx(s);
1980 return ((void *)addr);
1981 }
1982 return NULL;
1983 }
1984
1985 void *
1986 contigmalloc(
1987 unsigned long size, /* should be size_t here and for malloc() */
1988 struct malloc_type *type,
1989 int flags,
1990 vm_paddr_t low,
1991 vm_paddr_t high,
1992 unsigned long alignment,
1993 unsigned long boundary)
1994 {
1995 return contigmalloc1(size, type, flags, low, high, alignment, boundary,
1996 kernel_map);
1997 }
1998
1999 void
2000 contigfree(void *addr, unsigned long size, struct malloc_type *type)
2001 {
2002 kmem_free(kernel_map, (vm_offset_t)addr, size);
2003 }
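
/*
 * A hypothetical usage sketch of the two interfaces above: a driver that
 * needs a physically contiguous, page-aligned 8KB buffer below 16MB
 * (e.g. for ISA DMA) might do something like
 *
 *	void *buf;
 *
 *	buf = contigmalloc(8192, M_DEVBUF, M_NOWAIT, 0ul, 0x1000000ul,
 *	    PAGE_SIZE, 0ul);
 *	if (buf == NULL)
 *		return (ENOMEM);
 *	...
 *	contigfree(buf, 8192, M_DEVBUF);
 *
 * The low/high bounds, alignment and boundary arguments are passed
 * straight through to contigmalloc1() above; a boundary of 0 means no
 * boundary restriction.
 */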
2004
2005 vm_offset_t
2006 vm_page_alloc_contig(
2007 vm_offset_t size,
2008 vm_paddr_t low,
2009 vm_paddr_t high,
2010 vm_offset_t alignment)
2011 {
2012 return ((vm_offset_t)contigmalloc1(size, M_DEVBUF, M_NOWAIT, low, high,
2013 alignment, 0ul, kernel_map));
2014 }
2015
2016 #include "opt_ddb.h"
2017 #ifdef DDB
2018 #include <sys/kernel.h>
2019
2020 #include <ddb/ddb.h>
2021
2022 DB_SHOW_COMMAND(page, vm_page_print_page_info)
2023 {
2024 db_printf("cnt.v_free_count: %d\n", cnt.v_free_count);
2025 db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count);
2026 db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count);
2027 db_printf("cnt.v_active_count: %d\n", cnt.v_active_count);
2028 db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count);
2029 db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved);
2030 db_printf("cnt.v_free_min: %d\n", cnt.v_free_min);
2031 db_printf("cnt.v_free_target: %d\n", cnt.v_free_target);
2032 db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min);
2033 db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target);
2034 }
2035
2036 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
2037 {
2038 int i;
2039 db_printf("PQ_FREE:");
2040 for (i = 0; i < PQ_L2_SIZE; i++) {
2041 db_printf(" %d", vm_page_queues[PQ_FREE + i].lcnt);
2042 }
2043 db_printf("\n");
2044
2045 db_printf("PQ_CACHE:");
2046 for (i = 0; i < PQ_L2_SIZE; i++) {
2047 db_printf(" %d", vm_page_queues[PQ_CACHE + i].lcnt);
2048 }
2049 db_printf("\n");
2050
2051 db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
2052 vm_page_queues[PQ_ACTIVE].lcnt,
2053 vm_page_queues[PQ_INACTIVE].lcnt);
2054 }
2055 #endif /* DDB */